No OneTemporary
Actions

Size

1 MB

Referenced Files

None

Subscribers

None

View Options

This file is larger than 256 KB, so syntax highlighting was skipped.

	Index: projects/clang400-import/contrib/compiler-rt
	===================================================================
	--- projects/clang400-import/contrib/compiler-rt (revision 313642)
	+++ projects/clang400-import/contrib/compiler-rt (revision 313643)

	Property changes on: projects/clang400-import/contrib/compiler-rt
	___________________________________________________________________
	Modified: svn:mergeinfo
	## -0,0 +0,1 ##
	Merged /vendor/compiler-rt/dist:r313300-313642
	Index: projects/clang400-import/contrib/libc++/include/optional
	===================================================================
	--- projects/clang400-import/contrib/libc++/include/optional (revision 313642)
	+++ projects/clang400-import/contrib/libc++/include/optional (revision 313643)
	@@ -1,1314 +1,1312 @@
	// -*- C++ -*-
	//===-------------------------- optional ----------------------------------===//
	//
	// The LLVM Compiler Infrastructure
	//
	// This file is dual licensed under the MIT and the University of Illinois Open
	// Source Licenses. See LICENSE.TXT for details.
	//
	//===----------------------------------------------------------------------===//

	#ifndef _LIBCPP_OPTIONAL
	#define _LIBCPP_OPTIONAL

	/*
	optional synopsis

	// C++1z

	namespace std {
	// 20.6.3, optional for object types
	template <class T> class optional;

	// 20.6.4, no-value state indicator
	struct nullopt_t{see below };
	constexpr nullopt_t nullopt(unspecified );

	// 20.6.5, class bad_optional_access
	class bad_optional_access;

	// 20.6.6, relational operators
	template <class T>
	constexpr bool operator==(const optional<T>&, const optional<T>&);
	template <class T>
	constexpr bool operator!=(const optional<T>&, const optional<T>&);
	template <class T>
	constexpr bool operator<(const optional<T>&, const optional<T>&);
	template <class T>
	constexpr bool operator>(const optional<T>&, const optional<T>&);
	template <class T>
	constexpr bool operator<=(const optional<T>&, const optional<T>&);
	template <class T>
	constexpr bool operator>=(const optional<T>&, const optional<T>&);
	template <class T> constexpr bool operator==(const optional<T>&, nullopt_t) noexcept;
	template <class T> constexpr bool operator==(nullopt_t, const optional<T>&) noexcept;
	template <class T> constexpr bool operator!=(const optional<T>&, nullopt_t) noexcept;
	template <class T> constexpr bool operator!=(nullopt_t, const optional<T>&) noexcept;
	template <class T> constexpr bool operator<(const optional<T>&, nullopt_t) noexcept;
	template <class T> constexpr bool operator<(nullopt_t, const optional<T>&) noexcept;
	template <class T> constexpr bool operator<=(const optional<T>&, nullopt_t) noexcept;
	template <class T> constexpr bool operator<=(nullopt_t, const optional<T>&) noexcept;
	template <class T> constexpr bool operator>(const optional<T>&, nullopt_t) noexcept;
	template <class T> constexpr bool operator>(nullopt_t, const optional<T>&) noexcept;
	template <class T> constexpr bool operator>=(const optional<T>&, nullopt_t) noexcept;
	template <class T> constexpr bool operator>=(nullopt_t, const optional<T>&) noexcept;

	// 20.6.8, comparison with T
	template <class T> constexpr bool operator==(const optional<T>&, const T&);
	template <class T> constexpr bool operator==(const T&, const optional<T>&);
	template <class T> constexpr bool operator!=(const optional<T>&, const T&);
	template <class T> constexpr bool operator!=(const T&, const optional<T>&);
	template <class T> constexpr bool operator<(const optional<T>&, const T&);
	template <class T> constexpr bool operator<(const T&, const optional<T>&);
	template <class T> constexpr bool operator<=(const optional<T>&, const T&);
	template <class T> constexpr bool operator<=(const T&, const optional<T>&);
	template <class T> constexpr bool operator>(const optional<T>&, const T&);
	template <class T> constexpr bool operator>(const T&, const optional<T>&);
	template <class T> constexpr bool operator>=(const optional<T>&, const T&);
	template <class T> constexpr bool operator>=(const T&, const optional<T>&);

	// 20.6.9, specialized algorithms
	template <class T> void swap(optional<T>&, optional<T>&) noexcept(see below );
	template <class T> constexpr optional<see below > make_optional(T&&);
	template <class T, class... Args>
	constexpr optional<T> make_optional(Args&&... args);
	template <class T, class U, class... Args>
	constexpr optional<T> make_optional(initializer_list<U> il, Args&&... args);

	// 20.6.10, hash support
	template <class T> struct hash;
	template <class T> struct hash<optional<T>>;

	template <class T> class optional {
	public:
	using value_type = T;

	// 20.6.3.1, constructors
	constexpr optional() noexcept;
	constexpr optional(nullopt_t) noexcept;
	optional(const optional &);
	optional(optional &&) noexcept(see below );
	template <class... Args> constexpr explicit optional(in_place_t, Args &&...);
	template <class U, class... Args>
	constexpr explicit optional(in_place_t, initializer_list<U>, Args &&...);
	template <class U = T>
	constexpr EXPLICIT optional(U &&);
	template <class U>
	constexpr EXPLICIT optional(const optional<U> &);
	template <class U>
	constexpr EXPLICIT optional(optional<U> &&);

	// 20.6.3.2, destructor
	~optional();

	// 20.6.3.3, assignment
	optional &operator=(nullopt_t) noexcept;
	optional &operator=(const optional &);
	optional &operator=(optional &&) noexcept(see below );
	template <class U = T> optional &operator=(U &&);
	template <class U> optional &operator=(const optional<U> &);
	template <class U> optional &operator=(optional<U> &&);
	template <class... Args> void emplace(Args &&...);
	template <class U, class... Args>
	void emplace(initializer_list<U>, Args &&...);

	// 20.6.3.4, swap
	void swap(optional &) noexcept(see below );

	// 20.6.3.5, observers
	constexpr T const *operator->() const;
	constexpr T *operator->();
	constexpr T const &operator*() const &;
	constexpr T &operator*() &;
	constexpr T &&operator*() &&;
	constexpr const T &&operator*() const &&;
	constexpr explicit operator bool() const noexcept;
	constexpr bool has_value() const noexcept;
	constexpr T const &value() const &;
	constexpr T &value() &;
	constexpr T &&value() &&;
	constexpr const T &&value() const &&;
	template <class U> constexpr T value_or(U &&) const &;
	template <class U> constexpr T value_or(U &&) &&;

	// 20.6.3.6, modifiers
	void reset() noexcept;

	private:
	T *val; // exposition only
	};
	} // namespace std

	*/

	#include <__config>
	#include <__debug>
	#include <__functional_base>
	#include <__undef_min_max>
	#include <functional>
	#include <initializer_list>
	#include <new>
	#include <stdexcept>
	#include <type_traits>
	#include <utility>

	#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
	#pragma GCC system_header
	#endif

	namespace std // purposefully not using versioning namespace
	{

	// Exception type thrown by __throw_bad_optional_access() (defined later in
	// this header), which the optional::value() overloads call when the optional
	// holds no value. Only declarations appear here; the destructor and what()
	// are defined out of line.
	class _LIBCPP_EXCEPTION_ABI bad_optional_access
	- : public logic_error
	+ : public exception
	{
	public:
	- _LIBCPP_INLINE_VISIBILITY
	- bad_optional_access() : logic_error("bad optional access") {}
	-
	// Get the key function ~bad_optional_access() into the dylib
	virtual ~bad_optional_access() _NOEXCEPT;
	+ virtual const char* what() const _NOEXCEPT;
	};

	} // std

	#if _LIBCPP_STD_VER > 14

	_LIBCPP_BEGIN_NAMESPACE_STD

	// Reports an attempt to read the value of a disengaged optional.
	// With exceptions enabled this throws bad_optional_access; when the library
	// is built with _LIBCPP_NO_EXCEPTIONS it aborts instead. It never returns.
	_LIBCPP_NORETURN
	inline _LIBCPP_INLINE_VISIBILITY
	void __throw_bad_optional_access()
	{
	#ifdef _LIBCPP_NO_EXCEPTIONS
	    _VSTD::abort();
	#else
	    throw bad_optional_access();
	#endif
	}

	// Tag type used to denote the "no value" state of an optional.
	// Its only user-declared constructor is explicit and takes two __secret_tag
	// arguments, so nullopt_t has no default constructor.
	struct nullopt_t
	{
	    // Helper tag whose default constructor is explicit.
	    struct __secret_tag
	    {
	        _LIBCPP_INLINE_VISIBILITY
	        explicit __secret_tag() = default;
	    };

	    _LIBCPP_INLINE_VISIBILITY
	    constexpr explicit nullopt_t(__secret_tag, __secret_tag) noexcept {}
	};

	// The single nullopt_t constant handed to optional's nullopt_t overloads.
	/* inline */ constexpr nullopt_t nullopt{nullopt_t::__secret_tag{}, nullopt_t::__secret_tag{}};

	template <class _Tp, bool = is_trivially_destructible<_Tp>::value>
	struct __optional_destruct_base;

	template <class _Tp>
	struct __optional_destruct_base<_Tp, false>
	{
	typedef _Tp value_type;
	static_assert(is_object_v<value_type>,
	"instantiation of optional with a non-object type is undefined behavior");
	union
	{
	char __null_state_;
	value_type __val_;
	};
	bool __engaged_;

	_LIBCPP_INLINE_VISIBILITY
	~__optional_destruct_base()
	{
	if (__engaged_)
	__val_.~value_type();
	}

	_LIBCPP_INLINE_VISIBILITY
	constexpr __optional_destruct_base() noexcept
	: __null_state_(),
	__engaged_(false) {}

	template <class... _Args>
	_LIBCPP_INLINE_VISIBILITY
	constexpr explicit __optional_destruct_base(in_place_t, _Args&&... __args)
	: __val_(_VSTD::forward<_Args>(__args)...),
	__engaged_(true) {}

	_LIBCPP_INLINE_VISIBILITY
	void reset() noexcept
	{
	if (__engaged_)
	{
	__val_.~value_type();
	__engaged_ = false;
	}
	}
	};

	// Specialization for trivially destructible types: no destructor is
	// declared, and reset() only needs to clear the engaged flag.
	template <class _Tp>
	struct __optional_destruct_base<_Tp, true>
	{
	    using value_type = _Tp;
	    static_assert(is_object_v<value_type>,
	        "instantiation of optional with a non-object type is undefined behavior");

	    // Either a placeholder byte (disengaged) or the live value (engaged).
	    union
	    {
	        char __null_state_;
	        value_type __val_;
	    };
	    // True while __val_ is the active union member.
	    bool __engaged_;

	    // Starts out disengaged.
	    _LIBCPP_INLINE_VISIBILITY
	    constexpr __optional_destruct_base() noexcept
	        : __null_state_(),
	          __engaged_(false) {}

	    // Starts out engaged, constructing the value from the forwarded arguments.
	    template <class... _Args>
	    _LIBCPP_INLINE_VISIBILITY
	    constexpr explicit __optional_destruct_base(in_place_t, _Args&&... __args)
	        : __val_(_VSTD::forward<_Args>(__args)...),
	          __engaged_(true) {}

	    // Marks the storage disengaged; no destructor call is made.
	    _LIBCPP_INLINE_VISIBILITY
	    void reset() noexcept
	    {
	        if (!__engaged_)
	            return;
	        __engaged_ = false;
	    }
	};

	template <class _Tp, bool = is_reference<_Tp>::value>
	struct __optional_storage_base : __optional_destruct_base<_Tp>
	{
	using __base = __optional_destruct_base<_Tp>;
	using value_type = _Tp;
	using __base::__base;

	_LIBCPP_INLINE_VISIBILITY
	constexpr bool has_value() const noexcept
	{
	return this->__engaged_;
	}

	_LIBCPP_INLINE_VISIBILITY
	constexpr value_type& __get() & noexcept
	{
	return this->__val_;
	}
	_LIBCPP_INLINE_VISIBILITY
	constexpr const value_type& __get() const& noexcept
	{
	return this->__val_;
	}
	_LIBCPP_INLINE_VISIBILITY
	constexpr value_type&& __get() && noexcept
	{
	return _VSTD::move(this->__val_);
	}
	_LIBCPP_INLINE_VISIBILITY
	constexpr const value_type&& __get() const&& noexcept
	{
	return _VSTD::move(this->__val_);
	}

	template <class... _Args>
	_LIBCPP_INLINE_VISIBILITY
	void __construct(_Args&&... __args)
	{
	_LIBCPP_ASSERT(!has_value(), "__construct called for engaged __optional_storage");
	::new((void*)_VSTD::addressof(this->__val_)) value_type(_VSTD::forward<_Args>(__args)...);
	this->__engaged_ = true;
	}

	template <class _That>
	_LIBCPP_INLINE_VISIBILITY
	void __construct_from(_That&& __opt)
	{
	if (__opt.has_value())
	__construct(_VSTD::forward<_That>(__opt).__get());
	}

	template <class _That>
	_LIBCPP_INLINE_VISIBILITY
	void __assign_from(_That&& __opt)
	{
	if (this->__engaged_ == __opt.has_value())
	{
	if (this->__engaged_)
	this->__val_ = _VSTD::forward<_That>(__opt).__get();
	}
	else
	{
	if (this->__engaged_)
	this->reset();
	else
	__construct(_VSTD::forward<_That>(__opt).__get());
	}
	}
	};

	// optional<T&> is currently required to be ill-formed; however, it may be
	// permitted in the future. For this reason it has already been implemented
	// to ensure we can make the change in an ABI compatible manner.
	// Storage specialization for reference types. The referent is tracked
	// through a pointer; a null pointer represents the disengaged state.
	template <class _Tp>
	struct __optional_storage_base<_Tp, true>
	{
	    using value_type = _Tp;
	    using __raw_type = remove_reference_t<_Tp>;
	    // Address of the bound object, or nullptr when disengaged.
	    __raw_type* __value_;

	    // Compile-time check that an argument of type _Up may be bound to _Tp.
	    // For an lvalue-reference _Tp the argument must be an lvalue whose
	    // pointer converts to _Tp's pointer type, or a reference_wrapper of the
	    // referred-to type (optionally without const). For an rvalue-reference
	    // _Tp the argument must not be an lvalue and its pointer must convert.
	    template <class _Up>
	    static constexpr bool __can_bind_reference() {
	        using _RawUp = typename remove_reference<_Up>::type;
	        using _UpPtr = _RawUp*;
	        using _RawTp = typename remove_reference<_Tp>::type;
	        using _TpPtr = _RawTp*;
	        using _CheckLValueArg = integral_constant<bool,
	            (is_lvalue_reference<_Up>::value && is_convertible<_UpPtr, _TpPtr>::value)
	         || is_same<_RawUp, reference_wrapper<_RawTp>>::value
	         || is_same<_RawUp, reference_wrapper<typename remove_const<_RawTp>::type>>::value
	        >;
	        return (is_lvalue_reference<_Tp>::value && _CheckLValueArg::value)
	            || (is_rvalue_reference<_Tp>::value && !is_lvalue_reference<_Up>::value &&
	                is_convertible<_UpPtr, _TpPtr>::value);
	    }

	    // Starts out disengaged.
	    _LIBCPP_INLINE_VISIBILITY
	    constexpr __optional_storage_base() noexcept
	        : __value_(nullptr) {}

	    // Starts out bound to __uarg; rejected at compile time when the binding
	    // check above fails.
	    template <class _UArg>
	    _LIBCPP_INLINE_VISIBILITY
	    constexpr explicit __optional_storage_base(in_place_t, _UArg&& __uarg)
	        : __value_(_VSTD::addressof(__uarg))
	    {
	        static_assert(__can_bind_reference<_UArg>(),
	            "Attempted to construct a reference element in tuple from a "
	            "possible temporary");
	    }

	    // Drops the binding.
	    _LIBCPP_INLINE_VISIBILITY
	    void reset() noexcept { __value_ = nullptr; }

	    // Whether a referent is currently bound.
	    _LIBCPP_INLINE_VISIBILITY
	    constexpr bool has_value() const noexcept
	        { return __value_ != nullptr; }

	    // Unchecked access to the referent.
	    _LIBCPP_INLINE_VISIBILITY
	    constexpr value_type& __get() const& noexcept
	        { return *__value_; }

	    _LIBCPP_INLINE_VISIBILITY
	    constexpr value_type&& __get() const&& noexcept
	        { return _VSTD::forward<value_type>(*__value_); }

	    // Binds to __val. The storage must be disengaged on entry.
	    template <class _UArg>
	    _LIBCPP_INLINE_VISIBILITY
	    void __construct(_UArg&& __val)
	    {
	        _LIBCPP_ASSERT(!has_value(), "__construct called for engaged __optional_storage");
	        static_assert(__can_bind_reference<_UArg>(),
	            "Attempted to construct a reference element in tuple from a "
	            "possible temporary");
	        __value_ = _VSTD::addressof(__val);
	    }

	    // Binds to __opt's referent when __opt holds one; otherwise does nothing.
	    template <class _That>
	    _LIBCPP_INLINE_VISIBILITY
	    void __construct_from(_That&& __opt)
	    {
	        if (__opt.has_value())
	            __construct(_VSTD::forward<_That>(__opt).__get());
	    }

	    // Makes this storage match __opt. When both are engaged the referent is
	    // assigned through (the binding itself is not changed); when only this
	    // one is engaged it is reset; when only __opt is engaged this binds to
	    // __opt's referent.
	    template <class _That>
	    _LIBCPP_INLINE_VISIBILITY
	    void __assign_from(_That&& __opt)
	    {
	        if (has_value() == __opt.has_value())
	        {
	            if (has_value())
	                *__value_ = _VSTD::forward<_That>(__opt).__get();
	        }
	        else
	        {
	            if (has_value())
	                reset();
	            else
	                __construct(_VSTD::forward<_That>(__opt).__get());
	        }
	    }
	};

	template <class _Tp, bool = is_trivially_copyable<_Tp>::value>
	struct __optional_storage;

	template <class _Tp>
	struct __optional_storage<_Tp, true> : __optional_storage_base<_Tp>
	{
	using __optional_storage_base<_Tp>::__optional_storage_base;
	};

	template <class _Tp>
	struct __optional_storage<_Tp, false> : __optional_storage_base<_Tp>
	{
	using value_type = _Tp;
	using __optional_storage_base<_Tp>::__optional_storage_base;

	_LIBCPP_INLINE_VISIBILITY
	__optional_storage() = default;

	_LIBCPP_INLINE_VISIBILITY
	__optional_storage(const __optional_storage& __opt)
	{
	this->__construct_from(__opt);
	}

	_LIBCPP_INLINE_VISIBILITY
	__optional_storage(__optional_storage&& __opt)
	noexcept(is_nothrow_move_constructible_v<value_type>)
	{
	this->__construct_from(_VSTD::move(__opt));
	}

	_LIBCPP_INLINE_VISIBILITY
	__optional_storage& operator=(const __optional_storage& __opt)
	{
	this->__assign_from(__opt);
	return *this;
	}

	_LIBCPP_INLINE_VISIBILITY
	__optional_storage& operator=(__optional_storage&& __opt)
	noexcept(is_nothrow_move_assignable_v<value_type> &&
	is_nothrow_move_constructible_v<value_type>)
	{
	this->__assign_from(_VSTD::move(__opt));
	return *this;
	}
	};

	// Base-class selector passed to __sfinae_ctor_base: first argument is
	// whether _Tp is copy constructible, second whether it is move constructible.
	template <class _Tp>
	using __optional_sfinae_ctor_base_t = __sfinae_ctor_base<
	    is_copy_constructible_v<_Tp>,
	    is_move_constructible_v<_Tp>
	>;

	// Base-class selector passed to __sfinae_assign_base: each argument requires
	// both constructibility and assignability for the copy / move case.
	template <class _Tp>
	using __optional_sfinae_assign_base_t = __sfinae_assign_base<
	    (is_copy_constructible_v<_Tp> && is_copy_assignable_v<_Tp>),
	    (is_move_constructible_v<_Tp> && is_move_assignable_v<_Tp>)
	>;

	// optional<_Tp>: an object that either holds a value of type _Tp or is
	// empty ("disengaged"). Storage comes from __optional_storage<_Tp>; the two
	// __optional_sfinae_* bases are selected from _Tp's copy/move traits.
	template <class _Tp>
	class optional
	    : private __optional_storage<_Tp>
	    , private __optional_sfinae_ctor_base_t<_Tp>
	    , private __optional_sfinae_assign_base_t<_Tp>
	{
	    using __base = __optional_storage<_Tp>;
	public:
	    using value_type = _Tp;

	private:
	    static_assert(!is_same_v<value_type, in_place_t>,
	        "instantiation of optional with in_place_t is ill-formed");
	    static_assert(!is_same_v<__uncvref_t<value_type>, nullopt_t>,
	        "instantiation of optional with nullopt_t is ill-formed");
	    // Disable the reference extension using this static assert.
	    static_assert(!is_reference_v<value_type>,
	        "instantiation of optional with a reference type is ill-formed");
	    static_assert(is_destructible_v<value_type>,
	        "instantiation of optional with a non-destructible type is ill-formed");

	    // LWG2756: conditionally explicit conversion from _Up
	    struct _CheckOptionalArgsConstructor {
	        // Implicit when _Tp is constructible from _Up&& and the conversion
	        // is also implicit.
	        template <class _Up>
	        static constexpr bool __enable_implicit() {
	            return is_constructible_v<_Tp, _Up&&> &&
	                   is_convertible_v<_Up&&, _Tp>;
	        }

	        // Explicit when _Tp is constructible from _Up&& but not implicitly
	        // convertible.
	        template <class _Up>
	        static constexpr bool __enable_explicit() {
	            return is_constructible_v<_Tp, _Up&&> &&
	                   !is_convertible_v<_Up&&, _Tp>;
	        }
	    };
	    // Selects the checker above unless _Up is in_place_t or decays to this
	    // optional type, in which case the always-failing checker is used.
	    template <class _Up>
	    using _CheckOptionalArgsCtor = conditional_t<
	        !is_same_v<in_place_t, _Up> &&
	        !is_same_v<decay_t<_Up>, optional>,
	        _CheckOptionalArgsConstructor,
	        __check_tuple_constructor_fail
	    >;
	    // Checks used by the converting constructors/assignments that take an
	    // optional<_Up>. _QualUp is _Up with the qualification of the source.
	    template <class _QualUp>
	    struct _CheckOptionalLikeConstructor {
	      // True if _Tp can be constructed or converted from any flavour of
	      // optional<_Up> itself.
	      template <class _Up, class _Opt = optional<_Up>>
	      using __check_constructible_from_opt = __lazy_or<
	          is_constructible<_Tp, _Opt&>,
	          is_constructible<_Tp, _Opt const&>,
	          is_constructible<_Tp, _Opt&&>,
	          is_constructible<_Tp, _Opt const&&>,
	          is_convertible<_Opt&, _Tp>,
	          is_convertible<_Opt const&, _Tp>,
	          is_convertible<_Opt&&, _Tp>,
	          is_convertible<_Opt const&&, _Tp>
	      >;
	      // True if _Tp& can be assigned from any flavour of optional<_Up> itself.
	      template <class _Up, class _Opt = optional<_Up>>
	      using __check_assignable_from_opt = __lazy_or<
	          is_assignable<_Tp&, _Opt&>,
	          is_assignable<_Tp&, _Opt const&>,
	          is_assignable<_Tp&, _Opt&&>,
	          is_assignable<_Tp&, _Opt const&&>
	      >;
	      template <class _Up, class _QUp = _QualUp>
	      static constexpr bool __enable_implicit() {
	          return is_convertible<_QUp, _Tp>::value &&
	              !__check_constructible_from_opt<_Up>::value;
	      }
	      template <class _Up, class _QUp = _QualUp>
	      static constexpr bool __enable_explicit() {
	          return !is_convertible<_QUp, _Tp>::value &&
	              !__check_constructible_from_opt<_Up>::value;
	      }
	      template <class _Up, class _QUp = _QualUp>
	      static constexpr bool __enable_assign() {
	          // Construction and assignability of _Qup to _Tp has already been
	          // checked.
	          return !__check_constructible_from_opt<_Up>::value &&
	              !__check_assignable_from_opt<_Up>::value;
	      }
	    };

	    // Checker for converting construction from optional<_Up>: requires
	    // _Up != _Tp and _Tp constructible from _QualUp.
	    template <class _Up, class _QualUp>
	    using _CheckOptionalLikeCtor = conditional_t<
	      __lazy_and<
	          __lazy_not<is_same<_Up, _Tp>>,
	          is_constructible<_Tp, _QualUp>
	      >::value,
	      _CheckOptionalLikeConstructor<_QualUp>,
	      __check_tuple_constructor_fail
	    >;
	    // Checker for converting assignment from optional<_Up>: as above, and
	    // additionally _Tp& must be assignable from _QualUp.
	    template <class _Up, class _QualUp>
	    using _CheckOptionalLikeAssign = conditional_t<
	      __lazy_and<
	          __lazy_not<is_same<_Up, _Tp>>,
	          is_constructible<_Tp, _QualUp>,
	          is_assignable<_Tp&, _QualUp>
	      >::value,
	      _CheckOptionalLikeConstructor<_QualUp>,
	      __check_tuple_constructor_fail
	    >;
	public:

	    // Empty optional.
	    _LIBCPP_INLINE_VISIBILITY constexpr optional() noexcept {}
	    _LIBCPP_INLINE_VISIBILITY optional(const optional&) = default;
	    _LIBCPP_INLINE_VISIBILITY optional(optional&&) = default;
	    _LIBCPP_INLINE_VISIBILITY constexpr optional(nullopt_t) noexcept {}

	    // In-place construction of the value from __args.
	    template <class... _Args, class = enable_if_t<
	        is_constructible_v<value_type, _Args...>>
	    >
	    _LIBCPP_INLINE_VISIBILITY
	    constexpr explicit optional(in_place_t, _Args&&... __args)
	        : __base(in_place, _VSTD::forward<_Args>(__args)...) {}

	    // In-place construction from an initializer list plus __args.
	    template <class _Up, class... _Args, class = enable_if_t<
	        is_constructible_v<value_type, initializer_list<_Up>&, _Args...>>
	    >
	    _LIBCPP_INLINE_VISIBILITY
	    constexpr explicit optional(in_place_t, initializer_list<_Up> __il, _Args&&... __args)
	        : __base(in_place, __il, _VSTD::forward<_Args>(__args)...) {}

	    // Converting construction from a value (implicit form).
	    template <class _Up = value_type, enable_if_t<
	        _CheckOptionalArgsCtor<_Up>::template __enable_implicit<_Up>()
	    , int> = 0>
	    _LIBCPP_INLINE_VISIBILITY
	    constexpr optional(_Up&& __v)
	        : __base(in_place, _VSTD::forward<_Up>(__v)) {}

	    // Converting construction from a value (explicit form).
	    template <class _Up, enable_if_t<
	        _CheckOptionalArgsCtor<_Up>::template __enable_explicit<_Up>()
	    , int> = 0>
	    _LIBCPP_INLINE_VISIBILITY
	    constexpr explicit optional(_Up&& __v)
	        : __base(in_place, _VSTD::forward<_Up>(__v)) {}

	    // LWG2756: conditionally explicit conversion from const optional<_Up>&
	    template <class _Up, enable_if_t<
	        _CheckOptionalLikeCtor<_Up, _Up const&>::template __enable_implicit<_Up>()
	    , int> = 0>
	    _LIBCPP_INLINE_VISIBILITY
	    optional(const optional<_Up>& __v)
	    {
	        this->__construct_from(__v);
	    }
	    template <class _Up, enable_if_t<
	        _CheckOptionalLikeCtor<_Up, _Up const&>::template __enable_explicit<_Up>()
	    , int> = 0>
	    _LIBCPP_INLINE_VISIBILITY
	    explicit optional(const optional<_Up>& __v)
	    {
	        this->__construct_from(__v);
	    }

	    // LWG2756: conditionally explicit conversion from optional<_Up>&&
	    template <class _Up, enable_if_t<
	        _CheckOptionalLikeCtor<_Up, _Up &&>::template __enable_implicit<_Up>()
	    , int> = 0>
	    _LIBCPP_INLINE_VISIBILITY
	    optional(optional<_Up>&& __v)
	    {
	        this->__construct_from(_VSTD::move(__v));
	    }
	    template <class _Up, enable_if_t<
	        _CheckOptionalLikeCtor<_Up, _Up &&>::template __enable_explicit<_Up>()
	    , int> = 0>
	    _LIBCPP_INLINE_VISIBILITY
	    explicit optional(optional<_Up>&& __v)
	    {
	        this->__construct_from(_VSTD::move(__v));
	    }

	    // Assigning nullopt empties the optional.
	    _LIBCPP_INLINE_VISIBILITY
	    optional& operator=(nullopt_t) noexcept
	    {
	        reset();
	        return *this;
	    }

	    _LIBCPP_INLINE_VISIBILITY optional& operator=(const optional&) = default;
	    _LIBCPP_INLINE_VISIBILITY optional& operator=(optional&&) = default;

	    // LWG2756
	    // Assignment from a value: assigns to the held value when engaged,
	    // otherwise constructs a new one.
	    template <class _Up = value_type,
	              class = enable_if_t
	                      <__lazy_and<
	                          integral_constant<bool,
	                              !is_same_v<decay_t<_Up>, optional> &&
	                              !(is_same_v<_Up, value_type> && is_scalar_v<value_type>)
	                          >,
	                          is_constructible<value_type, _Up>,
	                          is_assignable<value_type&, _Up>
	                      >::value>
	             >
	    _LIBCPP_INLINE_VISIBILITY
	    optional&
	    operator=(_Up&& __v)
	    {
	        if (this->has_value())
	            this->__get() = _VSTD::forward<_Up>(__v);
	        else
	            this->__construct(_VSTD::forward<_Up>(__v));
	        return *this;
	    }

	    // LWG2756
	    // Converting copy assignment from optional<_Up>.
	    template <class _Up, enable_if_t<
	        _CheckOptionalLikeAssign<_Up, _Up const&>::template __enable_assign<_Up>()
	    , int> = 0>
	    _LIBCPP_INLINE_VISIBILITY
	    optional&
	    operator=(const optional<_Up>& __v)
	    {
	        this->__assign_from(__v);
	        return *this;
	    }

	    // LWG2756
	    // Converting move assignment from optional<_Up>. Constrained with
	    // _CheckOptionalLikeAssign (not the constructor checker) so that
	    // is_assignable<_Tp&, _Up&&> is verified, mirroring the copy overload.
	    template <class _Up, enable_if_t<
	        _CheckOptionalLikeAssign<_Up, _Up &&>::template __enable_assign<_Up>()
	    , int> = 0>
	    _LIBCPP_INLINE_VISIBILITY
	    optional&
	    operator=(optional<_Up>&& __v)
	    {
	        this->__assign_from(_VSTD::move(__v));
	        return *this;
	    }

	    // Destroys any held value, then constructs a new one from __args.
	    template <class... _Args,
	              class = enable_if_t
	                      <
	                          is_constructible_v<value_type, _Args...>
	                      >
	             >
	    _LIBCPP_INLINE_VISIBILITY
	    void
	    emplace(_Args&&... __args)
	    {
	        reset();
	        this->__construct(_VSTD::forward<_Args>(__args)...);
	    }

	    // Same as above, with a leading initializer list.
	    template <class _Up, class... _Args,
	              class = enable_if_t
	                      <
	                          is_constructible_v<value_type, initializer_list<_Up>&, _Args...>
	                      >
	             >
	    _LIBCPP_INLINE_VISIBILITY
	    void
	    emplace(initializer_list<_Up> __il, _Args&&... __args)
	    {
	        reset();
	        this->__construct(__il, _VSTD::forward<_Args>(__args)...);
	    }

	    // Exchanges state with __opt. When exactly one side is engaged, its
	    // value is moved into the other side and the source is reset.
	    _LIBCPP_INLINE_VISIBILITY
	    void swap(optional& __opt)
	        noexcept(is_nothrow_move_constructible_v<value_type> &&
	                 is_nothrow_swappable_v<value_type>)
	    {
	        if (this->has_value() == __opt.has_value())
	        {
	            using _VSTD::swap;
	            if (this->has_value())
	                swap(this->__get(), __opt.__get());
	        }
	        else
	        {
	            if (this->has_value())
	            {
	                __opt.__construct(_VSTD::move(this->__get()));
	                reset();
	            }
	            else
	            {
	                this->__construct(_VSTD::move(__opt.__get()));
	                __opt.reset();
	            }
	        }
	    }

	    // Pointer to the held value; asserts (in debug mode) that one exists.
	    _LIBCPP_INLINE_VISIBILITY
	    constexpr
	    add_pointer_t<value_type const>
	    operator->() const
	    {
	        _LIBCPP_ASSERT(this->has_value(), "optional operator-> called for disengaged value");
	#ifndef _LIBCPP_HAS_NO_BUILTIN_ADDRESSOF
	        return _VSTD::addressof(this->__get());
	#else
	        return __operator_arrow(__has_operator_addressof<value_type>{}, this->__get());
	#endif
	    }

	    _LIBCPP_INLINE_VISIBILITY
	    constexpr
	    add_pointer_t<value_type>
	    operator->()
	    {
	        _LIBCPP_ASSERT(this->has_value(), "optional operator-> called for disengaged value");
	#ifndef _LIBCPP_HAS_NO_BUILTIN_ADDRESSOF
	        return _VSTD::addressof(this->__get());
	#else
	        return __operator_arrow(__has_operator_addressof<value_type>{}, this->__get());
	#endif
	    }

	    // Unchecked access to the held value (debug assertion only), one
	    // overload per value category.
	    _LIBCPP_INLINE_VISIBILITY
	    constexpr
	    const value_type&
	    operator*() const&
	    {
	        _LIBCPP_ASSERT(this->has_value(), "optional operator* called for disengaged value");
	        return this->__get();
	    }

	    _LIBCPP_INLINE_VISIBILITY
	    constexpr
	    value_type&
	    operator*() &
	    {
	        _LIBCPP_ASSERT(this->has_value(), "optional operator* called for disengaged value");
	        return this->__get();
	    }

	    _LIBCPP_INLINE_VISIBILITY
	    constexpr
	    value_type&&
	    operator*() &&
	    {
	        _LIBCPP_ASSERT(this->has_value(), "optional operator* called for disengaged value");
	        return _VSTD::move(this->__get());
	    }

	    _LIBCPP_INLINE_VISIBILITY
	    constexpr
	    const value_type&&
	    operator*() const&&
	    {
	        _LIBCPP_ASSERT(this->has_value(), "optional operator* called for disengaged value");
	        return _VSTD::move(this->__get());
	    }

	    _LIBCPP_INLINE_VISIBILITY
	    constexpr explicit operator bool() const noexcept { return has_value(); }

	    using __base::has_value;
	    using __base::__get;

	    // Checked access: calls __throw_bad_optional_access() when empty.
	    _LIBCPP_INLINE_VISIBILITY
	    constexpr value_type const& value() const&
	    {
	        if (!this->has_value())
	            __throw_bad_optional_access();
	        return this->__get();
	    }

	    _LIBCPP_INLINE_VISIBILITY
	    constexpr value_type& value() &
	    {
	        if (!this->has_value())
	            __throw_bad_optional_access();
	        return this->__get();
	    }

	    _LIBCPP_INLINE_VISIBILITY
	    constexpr value_type&& value() &&
	    {
	        if (!this->has_value())
	            __throw_bad_optional_access();
	        return _VSTD::move(this->__get());
	    }

	    _LIBCPP_INLINE_VISIBILITY
	    constexpr value_type const&& value() const&&
	    {
	        if (!this->has_value())
	            __throw_bad_optional_access();
	        return _VSTD::move(this->__get());
	    }

	    // Returns a copy of the held value, or __v converted to value_type when
	    // empty.
	    template <class _Up>
	    _LIBCPP_INLINE_VISIBILITY
	    constexpr value_type value_or(_Up&& __v) const&
	    {
	        static_assert(is_copy_constructible_v<value_type>,
	                      "optional<T>::value_or: T must be copy constructible");
	        static_assert(is_convertible_v<_Up, value_type>,
	                      "optional<T>::value_or: U must be convertible to T");
	        return this->has_value() ? this->__get() :
	                                  static_cast<value_type>(_VSTD::forward<_Up>(__v));
	    }

	    // Rvalue overload: moves the held value out, or converts __v when
	    // empty. Declared constexpr to match the synopsis at the top of this
	    // header.
	    template <class _Up>
	    _LIBCPP_INLINE_VISIBILITY
	    constexpr value_type value_or(_Up&& __v) &&
	    {
	        static_assert(is_move_constructible_v<value_type>,
	                      "optional<T>::value_or: T must be move constructible");
	        static_assert(is_convertible_v<_Up, value_type>,
	                      "optional<T>::value_or: U must be convertible to T");
	        return this->has_value() ? _VSTD::move(this->__get()) :
	                                  static_cast<value_type>(_VSTD::forward<_Up>(__v));
	    }

	    using __base::reset;

	private:
	    // Fallbacks for operator-> when the addressof builtin is unavailable:
	    // dispatch on whether value_type overloads unary operator&.
	    template <class _Up>
	    _LIBCPP_INLINE_VISIBILITY
	    static _Up*
	    __operator_arrow(true_type, _Up& __x)
	    {
	        return _VSTD::addressof(__x);
	    }

	    template <class _Up>
	    _LIBCPP_INLINE_VISIBILITY
	    static constexpr _Up*
	    __operator_arrow(false_type, _Up& __x)
	    {
	        return &__x;
	    }
	};

	// Comparisons between optionals
	// Relational operators between two optionals of the same value type.
	// Each overload participates only when the corresponding comparison of two
	// const _Tp& yields something convertible to bool. An empty optional
	// compares equal to another empty optional and less than any engaged one;
	// when both are engaged the held values are compared. The operands must be
	// dereferenced for that last step: comparing __x and __y directly would
	// re-enter the same operator and recurse without end.
	template <class _Tp>
	_LIBCPP_INLINE_VISIBILITY constexpr
	enable_if_t<
	    is_convertible_v<decltype(_VSTD::declval<const _Tp&>() ==
	        _VSTD::declval<const _Tp&>()), bool>,
	    bool
	>
	operator==(const optional<_Tp>& __x, const optional<_Tp>& __y)
	{
	    if (static_cast<bool>(__x) != static_cast<bool>(__y))
	        return false;
	    if (!static_cast<bool>(__x))
	        return true;
	    return *__x == *__y;
	}

	template <class _Tp>
	_LIBCPP_INLINE_VISIBILITY constexpr
	enable_if_t<
	    is_convertible_v<decltype(_VSTD::declval<const _Tp&>() !=
	        _VSTD::declval<const _Tp&>()), bool>,
	    bool
	>
	operator!=(const optional<_Tp>& __x, const optional<_Tp>& __y)
	{
	    if (static_cast<bool>(__x) != static_cast<bool>(__y))
	        return true;
	    if (!static_cast<bool>(__x))
	        return false;
	    return *__x != *__y;
	}

	template <class _Tp>
	_LIBCPP_INLINE_VISIBILITY constexpr
	enable_if_t<
	    is_convertible_v<decltype(_VSTD::declval<const _Tp&>() <
	        _VSTD::declval<const _Tp&>()), bool>,
	    bool
	>
	operator<(const optional<_Tp>& __x, const optional<_Tp>& __y)
	{
	    if (!static_cast<bool>(__y))
	        return false;
	    if (!static_cast<bool>(__x))
	        return true;
	    return *__x < *__y;
	}

	template <class _Tp>
	_LIBCPP_INLINE_VISIBILITY constexpr
	enable_if_t<
	    is_convertible_v<decltype(_VSTD::declval<const _Tp&>() >
	        _VSTD::declval<const _Tp&>()), bool>,
	    bool
	>
	operator>(const optional<_Tp>& __x, const optional<_Tp>& __y)
	{
	    if (!static_cast<bool>(__x))
	        return false;
	    if (!static_cast<bool>(__y))
	        return true;
	    return *__x > *__y;
	}

	template <class _Tp>
	_LIBCPP_INLINE_VISIBILITY constexpr
	enable_if_t<
	    is_convertible_v<decltype(_VSTD::declval<const _Tp&>() <=
	        _VSTD::declval<const _Tp&>()), bool>,
	    bool
	>
	operator<=(const optional<_Tp>& __x, const optional<_Tp>& __y)
	{
	    if (!static_cast<bool>(__x))
	        return true;
	    if (!static_cast<bool>(__y))
	        return false;
	    return *__x <= *__y;
	}

	template <class _Tp>
	_LIBCPP_INLINE_VISIBILITY constexpr
	enable_if_t<
	    is_convertible_v<decltype(_VSTD::declval<const _Tp&>() >=
	        _VSTD::declval<const _Tp&>()), bool>,
	    bool
	>
	operator>=(const optional<_Tp>& __x, const optional<_Tp>& __y)
	{
	    if (!static_cast<bool>(__y))
	        return true;
	    if (!static_cast<bool>(__x))
	        return false;
	    return *__x >= *__y;
	}

	// Comparisons with nullopt
	// Relational operators between an optional and nullopt. nullopt orders as
	// an empty optional, so every result depends only on whether __x holds a
	// value (or is a constant when even that does not matter).
	template <class _Tp>
	_LIBCPP_INLINE_VISIBILITY constexpr
	bool
	operator==(const optional<_Tp>& __x, nullopt_t) noexcept
	{
	    return !__x.has_value();
	}

	template <class _Tp>
	_LIBCPP_INLINE_VISIBILITY constexpr
	bool
	operator==(nullopt_t, const optional<_Tp>& __x) noexcept
	{
	    return !__x.has_value();
	}

	template <class _Tp>
	_LIBCPP_INLINE_VISIBILITY constexpr
	bool
	operator!=(const optional<_Tp>& __x, nullopt_t) noexcept
	{
	    return __x.has_value();
	}

	template <class _Tp>
	_LIBCPP_INLINE_VISIBILITY constexpr
	bool
	operator!=(nullopt_t, const optional<_Tp>& __x) noexcept
	{
	    return __x.has_value();
	}

	// Nothing is less than nullopt.
	template <class _Tp>
	_LIBCPP_INLINE_VISIBILITY constexpr
	bool
	operator<(const optional<_Tp>&, nullopt_t) noexcept
	{
	    return false;
	}

	// nullopt is less than any engaged optional.
	template <class _Tp>
	_LIBCPP_INLINE_VISIBILITY constexpr
	bool
	operator<(nullopt_t, const optional<_Tp>& __x) noexcept
	{
	    return __x.has_value();
	}

	template <class _Tp>
	_LIBCPP_INLINE_VISIBILITY constexpr
	bool
	operator<=(const optional<_Tp>& __x, nullopt_t) noexcept
	{
	    return !__x.has_value();
	}

	template <class _Tp>
	_LIBCPP_INLINE_VISIBILITY constexpr
	bool
	operator<=(nullopt_t, const optional<_Tp>&) noexcept
	{
	    return true;
	}

	template <class _Tp>
	_LIBCPP_INLINE_VISIBILITY constexpr
	bool
	operator>(const optional<_Tp>& __x, nullopt_t) noexcept
	{
	    return __x.has_value();
	}

	// nullopt is never greater than anything.
	template <class _Tp>
	_LIBCPP_INLINE_VISIBILITY constexpr
	bool
	operator>(nullopt_t, const optional<_Tp>&) noexcept
	{
	    return false;
	}

	// Everything is greater than or equal to nullopt.
	template <class _Tp>
	_LIBCPP_INLINE_VISIBILITY constexpr
	bool
	operator>=(const optional<_Tp>&, nullopt_t) noexcept
	{
	    return true;
	}

	template <class _Tp>
	_LIBCPP_INLINE_VISIBILITY constexpr
	bool
	operator>=(nullopt_t, const optional<_Tp>& __x) noexcept
	{
	    return !__x.has_value();
	}

	// Comparisons with T
	template <class _Tp>
	_LIBCPP_INLINE_VISIBILITY constexpr
	enable_if_t<
	is_convertible_v<decltype(_VSTD::declval<const _Tp&>() ==
	_VSTD::declval<const _Tp&>()), bool>,
	bool
	>
	operator==(const optional<_Tp>& __x, const _Tp& __v)
	{
	return static_cast<bool>(__x) ? *__x == __v : false;
	}

	template <class _Tp>
	_LIBCPP_INLINE_VISIBILITY constexpr
	enable_if_t<
	is_convertible_v<decltype(_VSTD::declval<const _Tp&>() ==
	_VSTD::declval<const _Tp&>()), bool>,
	bool
	>
	operator==(const _Tp& __v, const optional<_Tp>& __x)
	{
	return static_cast<bool>(__x) ? __v == *__x : false;
	}

	template <class _Tp>
	_LIBCPP_INLINE_VISIBILITY constexpr
	enable_if_t<
	is_convertible_v<decltype(_VSTD::declval<const _Tp&>() !=
	_VSTD::declval<const _Tp&>()), bool>,
	bool
	>
	operator!=(const optional<_Tp>& __x, const _Tp& __v)
	{
	return static_cast<bool>(__x) ? *__x != __v : true;
	}

	template <class _Tp>
	_LIBCPP_INLINE_VISIBILITY constexpr
	enable_if_t<
	is_convertible_v<decltype(_VSTD::declval<const _Tp&>() !=
	_VSTD::declval<const _Tp&>()), bool>,
	bool
	>
	operator!=(const _Tp& __v, const optional<_Tp>& __x)
	{
	return static_cast<bool>(__x) ? __v != *__x : true;
	}

	// optional < value: true when __x is empty, otherwise the result of
	// `*__x < __v`. Enabled only when `const _Tp& < const _Tp&` yields
	// something convertible to bool.
	template <class _Tp>
	_LIBCPP_INLINE_VISIBILITY constexpr
	enable_if_t<
	    is_convertible_v<decltype(_VSTD::declval<const _Tp&>() <
	                              _VSTD::declval<const _Tp&>()), bool>,
	    bool
	>
	operator<(const optional<_Tp>& __x, const _Tp& __v)
	{
	    if (!__x.has_value())
	        return true;
	    return *__x < __v;
	}

	// value < optional: false when __x is empty, otherwise the result of
	// `__v < *__x`. Enabled only when `const _Tp& < const _Tp&` yields
	// something convertible to bool.
	template <class _Tp>
	_LIBCPP_INLINE_VISIBILITY constexpr
	enable_if_t<
	    is_convertible_v<decltype(_VSTD::declval<const _Tp&>() <
	                              _VSTD::declval<const _Tp&>()), bool>,
	    bool
	>
	operator<(const _Tp& __v, const optional<_Tp>& __x)
	{
	    if (!__x.has_value())
	        return false;
	    return __v < *__x;
	}

	// optional <= value: true when __x is empty, otherwise the result of
	// `*__x <= __v`. Enabled only when `const _Tp& <= const _Tp&` yields
	// something convertible to bool.
	template <class _Tp>
	_LIBCPP_INLINE_VISIBILITY constexpr
	enable_if_t<
	    is_convertible_v<decltype(_VSTD::declval<const _Tp&>() <=
	                              _VSTD::declval<const _Tp&>()), bool>,
	    bool
	>
	operator<=(const optional<_Tp>& __x, const _Tp& __v)
	{
	    if (!__x.has_value())
	        return true;
	    return *__x <= __v;
	}

	// value <= optional: false when __x is empty, otherwise the result of
	// `__v <= *__x`. Enabled only when `const _Tp& <= const _Tp&` yields
	// something convertible to bool.
	template <class _Tp>
	_LIBCPP_INLINE_VISIBILITY constexpr
	enable_if_t<
	    is_convertible_v<decltype(_VSTD::declval<const _Tp&>() <=
	                              _VSTD::declval<const _Tp&>()), bool>,
	    bool
	>
	operator<=(const _Tp& __v, const optional<_Tp>& __x)
	{
	    if (!__x.has_value())
	        return false;
	    return __v <= *__x;
	}

	// optional > value: false when __x is empty, otherwise the result of
	// `*__x > __v`. Enabled only when `const _Tp& > const _Tp&` yields
	// something convertible to bool.
	template <class _Tp>
	_LIBCPP_INLINE_VISIBILITY constexpr
	enable_if_t<
	    is_convertible_v<decltype(_VSTD::declval<const _Tp&>() >
	                              _VSTD::declval<const _Tp&>()), bool>,
	    bool
	>
	operator>(const optional<_Tp>& __x, const _Tp& __v)
	{
	    if (!__x.has_value())
	        return false;
	    return *__x > __v;
	}

	// value > optional: true when __x is empty, otherwise the result of
	// `__v > *__x`. Enabled only when `const _Tp& > const _Tp&` yields
	// something convertible to bool.
	template <class _Tp>
	_LIBCPP_INLINE_VISIBILITY constexpr
	enable_if_t<
	    is_convertible_v<decltype(_VSTD::declval<const _Tp&>() >
	                              _VSTD::declval<const _Tp&>()), bool>,
	    bool
	>
	operator>(const _Tp& __v, const optional<_Tp>& __x)
	{
	    if (!__x.has_value())
	        return true;
	    return __v > *__x;
	}

	// optional >= value: false when __x is empty, otherwise the result of
	// `*__x >= __v`. Enabled only when `const _Tp& >= const _Tp&` yields
	// something convertible to bool.
	template <class _Tp>
	_LIBCPP_INLINE_VISIBILITY constexpr
	enable_if_t<
	    is_convertible_v<decltype(_VSTD::declval<const _Tp&>() >=
	                              _VSTD::declval<const _Tp&>()), bool>,
	    bool
	>
	operator>=(const optional<_Tp>& __x, const _Tp& __v)
	{
	    if (!__x.has_value())
	        return false;
	    return *__x >= __v;
	}

	// value >= optional: true when __x is empty, otherwise the result of
	// `__v >= *__x`. Enabled only when `const _Tp& >= const _Tp&` yields
	// something convertible to bool.
	template <class _Tp>
	_LIBCPP_INLINE_VISIBILITY constexpr
	enable_if_t<
	    is_convertible_v<decltype(_VSTD::declval<const _Tp&>() >=
	                              _VSTD::declval<const _Tp&>()), bool>,
	    bool
	>
	operator>=(const _Tp& __v, const optional<_Tp>& __x)
	{
	    if (!__x.has_value())
	        return true;
	    return __v >= *__x;
	}


	// Non-member swap for optional; forwards to the member swap.
	// Participates in overload resolution only when _Tp is move constructible
	// and swappable. The noexcept specification mirrors that of __x.swap(__y).
	template <class _Tp>
	inline _LIBCPP_INLINE_VISIBILITY
	enable_if_t<
	is_move_constructible_v<_Tp> && is_swappable_v<_Tp>,
	void
	>
	swap(optional<_Tp>& __x, optional<_Tp>& __y) noexcept(noexcept(__x.swap(__y)))
	{
	__x.swap(__y);
	}

	// Builds an optional whose value type is the decayed type of the argument,
	// constructing it from the perfectly-forwarded __v.
	template <class _Tp>
	_LIBCPP_INLINE_VISIBILITY constexpr
	optional<decay_t<_Tp>> make_optional(_Tp&& __v)
	{
	    using __result_type = optional<decay_t<_Tp>>;
	    return __result_type(_VSTD::forward<_Tp>(__v));
	}

	// Builds an optional<_Tp> whose value is constructed in place from the
	// forwarded __args (uses the in_place constructor of optional).
	template <class _Tp, class... _Args>
	_LIBCPP_INLINE_VISIBILITY constexpr
	optional<_Tp> make_optional(_Args&&... __args)
	{
	return optional<_Tp>(in_place, _VSTD::forward<_Args>(__args)...);
	}

	// Builds an optional<_Tp> whose value is constructed in place from an
	// initializer_list followed by the forwarded __args.
	template <class _Tp, class _Up, class... _Args>
	_LIBCPP_INLINE_VISIBILITY constexpr
	optional<_Tp> make_optional(initializer_list<_Up> __il, _Args&&... __args)
	{
	return optional<_Tp>(in_place, __il, _VSTD::forward<_Args>(__args)...);
	}

	// Hash support for optional: an empty optional hashes to 0, an engaged one
	// hashes to hash<_Tp> of its contained value.
	template <class _Tp>
	struct _LIBCPP_TEMPLATE_VIS hash<optional<_Tp> >
	{
	    typedef optional<_Tp> argument_type;
	    typedef size_t result_type;

	    _LIBCPP_INLINE_VISIBILITY
	    result_type operator()(const argument_type& __opt) const _NOEXCEPT
	    {
	        if (!__opt.has_value())
	            return 0;
	        return hash<_Tp>()(*__opt);
	    }
	};

	_LIBCPP_END_NAMESPACE_STD

	#endif // _LIBCPP_STD_VER > 14

	#endif // _LIBCPP_OPTIONAL
	Index: projects/clang400-import/contrib/libc++/include/variant
	===================================================================
	--- projects/clang400-import/contrib/libc++/include/variant (revision 313642)
	+++ projects/clang400-import/contrib/libc++/include/variant (revision 313643)
	@@ -1,1568 +1,1568 @@
	// -- C++ --
	//===------------------------------ variant -------------------------------===//
	//
	// The LLVM Compiler Infrastructure
	//
	// This file is dual licensed under the MIT and the University of Illinois Open
	// Source Licenses. See LICENSE.TXT for details.
	//
	//===----------------------------------------------------------------------===//

	#ifndef _LIBCPP_VARIANT
	#define _LIBCPP_VARIANT

	/*
	variant synopsis

	namespace std {

	// 20.7.2, class template variant
	template <class... Types>
	class variant {
	public:

	// 20.7.2.1, constructors
	constexpr variant() noexcept(see below);
	variant(const variant&);
	variant(variant&&) noexcept(see below);

	template <class T> constexpr variant(T&&) noexcept(see below);

	template <class T, class... Args>
	constexpr explicit variant(in_place_type_t<T>, Args&&...);

	template <class T, class U, class... Args>
	constexpr explicit variant(
	in_place_type_t<T>, initializer_list<U>, Args&&...);

	template <size_t I, class... Args>
	constexpr explicit variant(in_place_index_t<I>, Args&&...);

	template <size_t I, class U, class... Args>
	constexpr explicit variant(
	in_place_index_t<I>, initializer_list<U>, Args&&...);

	// 20.7.2.2, destructor
	~variant();

	// 20.7.2.3, assignment
	variant& operator=(const variant&);
	variant& operator=(variant&&) noexcept(see below);

	template <class T> variant& operator=(T&&) noexcept(see below);

	// 20.7.2.4, modifiers
	template <class T, class... Args>
	void emplace(Args&&...);

	template <class T, class U, class... Args>
	void emplace(initializer_list<U>, Args&&...);

	template <size_t I, class... Args>
	void emplace(Args&&...);

	template <size_t I, class U, class... Args>
	void emplace(initializer_list<U>, Args&&...);

	// 20.7.2.5, value status
	constexpr bool valueless_by_exception() const noexcept;
	constexpr size_t index() const noexcept;

	// 20.7.2.6, swap
	void swap(variant&) noexcept(see below);
	};

	// 20.7.3, variant helper classes
	template <class T> struct variant_size; // undefined

	template <class T>
	constexpr size_t variant_size_v = variant_size<T>::value;

	template <class T> struct variant_size<const T>;
	template <class T> struct variant_size<volatile T>;
	template <class T> struct variant_size<const volatile T>;

	template <class... Types>
	struct variant_size<variant<Types...>>;

	template <size_t I, class T> struct variant_alternative; // undefined

	template <size_t I, class T>
	using variant_alternative_t = typename variant_alternative<I, T>::type;

	template <size_t I, class T> struct variant_alternative<I, const T>;
	template <size_t I, class T> struct variant_alternative<I, volatile T>;
	template <size_t I, class T> struct variant_alternative<I, const volatile T>;

	template <size_t I, class... Types>
	struct variant_alternative<I, variant<Types...>>;

	constexpr size_t variant_npos = -1;

	// 20.7.4, value access
	template <class T, class... Types>
	constexpr bool holds_alternative(const variant<Types...>&) noexcept;

	template <size_t I, class... Types>
	constexpr variant_alternative_t<I, variant<Types...>>&
	get(variant<Types...>&);

	template <size_t I, class... Types>
	constexpr variant_alternative_t<I, variant<Types...>>&&
	get(variant<Types...>&&);

	template <size_t I, class... Types>
	constexpr variant_alternative_t<I, variant<Types...>> const&
	get(const variant<Types...>&);

	template <size_t I, class... Types>
	constexpr variant_alternative_t<I, variant<Types...>> const&&
	get(const variant<Types...>&&);

	template <class T, class... Types>
	constexpr T& get(variant<Types...>&);

	template <class T, class... Types>
	constexpr T&& get(variant<Types...>&&);

	template <class T, class... Types>
	constexpr const T& get(const variant<Types...>&);

	template <class T, class... Types>
	constexpr const T&& get(const variant<Types...>&&);

	template <size_t I, class... Types>
	constexpr add_pointer_t<variant_alternative_t<I, variant<Types...>>>
	get_if(variant<Types...>*) noexcept;

	template <size_t I, class... Types>
	constexpr add_pointer_t<const variant_alternative_t<I, variant<Types...>>>
	get_if(const variant<Types...>*) noexcept;

	template <class T, class... Types>
	constexpr add_pointer_t<T>
	get_if(variant<Types...>*) noexcept;

	template <class T, class... Types>
	constexpr add_pointer_t<const T>
	get_if(const variant<Types...>*) noexcept;

	// 20.7.5, relational operators
	template <class... Types>
	constexpr bool operator==(const variant<Types...>&, const variant<Types...>&);

	template <class... Types>
	constexpr bool operator!=(const variant<Types...>&, const variant<Types...>&);

	template <class... Types>
	constexpr bool operator<(const variant<Types...>&, const variant<Types...>&);

	template <class... Types>
	constexpr bool operator>(const variant<Types...>&, const variant<Types...>&);

	template <class... Types>
	constexpr bool operator<=(const variant<Types...>&, const variant<Types...>&);

	template <class... Types>
	constexpr bool operator>=(const variant<Types...>&, const variant<Types...>&);

	// 20.7.6, visitation
	template <class Visitor, class... Variants>
	constexpr see below visit(Visitor&&, Variants&&...);

	// 20.7.7, class monostate
	struct monostate;

	// 20.7.8, monostate relational operators
	constexpr bool operator<(monostate, monostate) noexcept;
	constexpr bool operator>(monostate, monostate) noexcept;
	constexpr bool operator<=(monostate, monostate) noexcept;
	constexpr bool operator>=(monostate, monostate) noexcept;
	constexpr bool operator==(monostate, monostate) noexcept;
	constexpr bool operator!=(monostate, monostate) noexcept;

	// 20.7.9, specialized algorithms
	template <class... Types>
	void swap(variant<Types...>&, variant<Types...>&) noexcept(see below);

	// 20.7.10, class bad_variant_access
	class bad_variant_access;

	// 20.7.11, hash support
	template <class T> struct hash;
	template <class... Types> struct hash<variant<Types...>>;
	template <> struct hash<monostate>;

	} // namespace std

	*/

	#include <__config>
	#include <__tuple>
	#include <array>
	#include <exception>
	#include <functional>
	#include <initializer_list>
	#include <new>
	#include <tuple>
	#include <type_traits>
	#include <utility>

	#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
	#pragma GCC system_header
	#endif

	namespace std { // explicitly not using versioning namespace

	// Exception type derived from std::exception; thrown by
	// __throw_bad_variant_access() when exceptions are enabled.
	// what() is only declared here; its definition is not in this header section.
	class _LIBCPP_EXCEPTION_ABI bad_variant_access : public exception {
	public:
	virtual const char* what() const _NOEXCEPT;
	};

	} // namespace std

	_LIBCPP_BEGIN_NAMESPACE_STD

	#if _LIBCPP_STD_VER > 14

	// Never returns: throws bad_variant_access when exceptions are enabled,
	// otherwise (when _LIBCPP_NO_EXCEPTIONS is defined) calls abort().
	_LIBCPP_NORETURN
	inline _LIBCPP_INLINE_VISIBILITY
	void __throw_bad_variant_access() {
	#ifndef _LIBCPP_NO_EXCEPTIONS
	throw bad_variant_access();
	#else
	_VSTD::abort();
	#endif
	}

	// Forward declaration of the variant class template.
	template <class... _Types>
	class _LIBCPP_TEMPLATE_VIS variant;

	// variant_size<T>::value is the number of alternatives of a variant type.
	// The primary template is declared but not defined.
	template <class _Tp>
	struct _LIBCPP_TEMPLATE_VIS variant_size;

	// Convenience variable template for variant_size<_Tp>::value.
	template <class _Tp>
	constexpr size_t variant_size_v = variant_size<_Tp>::value;

	// cv-qualified types inherit the size of the unqualified type.
	template <class _Tp>
	struct _LIBCPP_TEMPLATE_VIS variant_size<const _Tp> : variant_size<_Tp> {};

	template <class _Tp>
	struct _LIBCPP_TEMPLATE_VIS variant_size<volatile _Tp> : variant_size<_Tp> {};

	template <class _Tp>
	struct _LIBCPP_TEMPLATE_VIS variant_size<const volatile _Tp>
	: variant_size<_Tp> {};

	// For variant<_Types...> the size is sizeof...(_Types).
	template <class... _Types>
	struct _LIBCPP_TEMPLATE_VIS variant_size<variant<_Types...>>
	: integral_constant<size_t, sizeof...(_Types)> {};

	// variant_alternative<I, T>::type is the I-th alternative type of a variant.
	// The primary template is declared but not defined.
	template <size_t _Ip, class _Tp>
	struct _LIBCPP_TEMPLATE_VIS variant_alternative;

	// Convenience alias for variant_alternative<_Ip, _Tp>::type.
	template <size_t _Ip, class _Tp>
	using variant_alternative_t = typename variant_alternative<_Ip, _Tp>::type;

	// cv-qualified variants yield the alternative type with the same
	// cv-qualification added.
	template <size_t _Ip, class _Tp>
	struct _LIBCPP_TEMPLATE_VIS variant_alternative<_Ip, const _Tp>
	: add_const<variant_alternative_t<_Ip, _Tp>> {};

	template <size_t _Ip, class _Tp>
	struct _LIBCPP_TEMPLATE_VIS variant_alternative<_Ip, volatile _Tp>
	: add_volatile<variant_alternative_t<_Ip, _Tp>> {};

	template <size_t _Ip, class _Tp>
	struct _LIBCPP_TEMPLATE_VIS variant_alternative<_Ip, const volatile _Tp>
	: add_cv<variant_alternative_t<_Ip, _Tp>> {};

	// For variant<_Types...>: selects the _Ip-th element of the pack.
	// An out-of-range index is rejected at compile time by the static_assert.
	template <size_t _Ip, class... _Types>
	struct _LIBCPP_TEMPLATE_VIS variant_alternative<_Ip, variant<_Types...>> {
	static_assert(_Ip < sizeof...(_Types));
	using type = __type_pack_element<_Ip, _Types...>;
	};

	// Public "no alternative" index value: size_t(-1).
	constexpr size_t variant_npos = static_cast<size_t>(-1);
	// Internal sentinel of type unsigned int, matching the type of the index
	// member stored in __variant_detail::__base; __base::index() maps it back
	// to variant_npos.
	constexpr unsigned int __variant_npos = static_cast<unsigned int>(-1);

	namespace __find_detail {

	// Compile-time search for _Tp among _Types...
	// Returns the index of the single match, __not_found when there is no
	// match, or __ambiguous as soon as a second match is encountered.
	template <class _Tp, class... _Types>
	inline _LIBCPP_INLINE_VISIBILITY
	constexpr size_t __find_index() {
	  constexpr bool __matches[] = {is_same_v<_Tp, _Types>...};
	  size_t __found = __not_found;
	  for (size_t __pos = 0; __pos < sizeof...(_Types); ++__pos) {
	    if (!__matches[__pos])
	      continue;
	    // A previous match already exists: the type occurs more than once.
	    if (__found != __not_found)
	      return __ambiguous;
	    __found = __pos;
	  }
	  return __found;
	}

	// Wraps a found index as an integral_constant. The specializations for
	// __not_found and __ambiguous are empty, so they expose no `value` member
	// and uses of `::value` fail substitution instead of yielding a sentinel.
	template <size_t _Index>
	struct __find_unambiguous_index_sfinae_impl
	: integral_constant<size_t, _Index> {};

	template <>
	struct __find_unambiguous_index_sfinae_impl<__not_found> {};

	template <>
	struct __find_unambiguous_index_sfinae_impl<__ambiguous> {};

	// Index of _Tp in _Types... as computed by __find_index; has a `value`
	// member only when _Tp occurs exactly once.
	template <class _Tp, class... _Types>
	struct __find_unambiguous_index_sfinae
	: __find_unambiguous_index_sfinae_impl<__find_index<_Tp, _Types...>()> {};

	} // namespace __find_detail

	namespace __variant_detail {

	// Tag type selecting the constructors that create a valueless state.
	struct __valueless_t {};

	// Availability of a special member function. The enumerators are declared
	// from most to least available; __common_trait relies on this ordering.
	enum class _Trait { _TriviallyAvailable, _Available, _Unavailable };

	// Classifies _Tp using two type-trait templates: trivially available if
	// _IsTriviallyAvailable<_Tp> holds, else available if _IsAvailable<_Tp>
	// holds, else unavailable.
	template <typename _Tp,
	template <typename> class _IsTriviallyAvailable,
	template <typename> class _IsAvailable>
	constexpr _Trait __trait =
	_IsTriviallyAvailable<_Tp>::value
	? _Trait::_TriviallyAvailable
	: _IsAvailable<_Tp>::value ? _Trait::_Available : _Trait::_Unavailable;

	// Folds a list of traits into the least available one, i.e. the entry with
	// the greatest underlying enumerator value. An empty list yields
	// _Trait::_TriviallyAvailable.
	inline _LIBCPP_INLINE_VISIBILITY
	constexpr _Trait __common_trait(initializer_list<_Trait> __traits) {
	  _Trait __worst = _Trait::_TriviallyAvailable;
	  for (const _Trait* __it = __traits.begin(); __it != __traits.end(); ++__it) {
	    if (static_cast<int>(*__it) > static_cast<int>(__worst)) {
	      __worst = *__it;
	    }
	  }
	  return __worst;
	}

	// Per-variant summary of how available each special member function is,
	// computed as the common (least available) trait over all alternatives.
	template <typename... _Types>
	struct __traits {
	static constexpr _Trait __copy_constructible_trait =
	__common_trait({__trait<_Types,
	is_trivially_copy_constructible,
	is_copy_constructible>...});

	static constexpr _Trait __move_constructible_trait =
	__common_trait({__trait<_Types,
	is_trivially_move_constructible,
	is_move_constructible>...});

	// Copy assignment additionally folds in both the copy- and
	// move-constructible traits of the variant.
	static constexpr _Trait __copy_assignable_trait = __common_trait(
	{__copy_constructible_trait,
	__move_constructible_trait,
	__trait<_Types, is_trivially_copy_assignable, is_copy_assignable>...});

	// Move assignment additionally folds in the move-constructible trait.
	static constexpr _Trait __move_assignable_trait = __common_trait(
	{__move_constructible_trait,
	__trait<_Types, is_trivially_move_assignable, is_move_assignable>...});

	static constexpr _Trait __destructible_trait = __common_trait(
	{__trait<_Types, is_trivially_destructible, is_destructible>...});
	};

	// Accessors that reach the _Ip-th alternative wrapper at each layer of the
	// implementation (recursive union, base class, public variant). Each one
	// forwards its argument so the value category of the input is preserved.
	namespace __access {

	struct __union {
	// Index 0: the alternative is the __head member of this union level.
	template <class _Vp>
	inline _LIBCPP_INLINE_VISIBILITY
	static constexpr auto&& __get_alt(_Vp&& __v, in_place_index_t<0>) {
	return _VSTD::forward<_Vp>(__v).__head;
	}

	// Index _Ip > 0: recurse into __tail with the index decremented by one.
	template <class _Vp, size_t _Ip>
	inline _LIBCPP_INLINE_VISIBILITY
	static constexpr auto&& __get_alt(_Vp&& __v, in_place_index_t<_Ip>) {
	return __get_alt(_VSTD::forward<_Vp>(__v).__tail, in_place_index<_Ip - 1>);
	}
	};

	struct __base {
	// Looks up alternative _Ip in the __data union of a base object.
	template <size_t _Ip, class _Vp>
	inline _LIBCPP_INLINE_VISIBILITY
	static constexpr auto&& __get_alt(_Vp&& __v) {
	return __union::__get_alt(_VSTD::forward<_Vp>(__v).__data,
	in_place_index<_Ip>);
	}
	};

	struct __variant {
	// Looks up alternative _Ip through the __impl member of a variant.
	template <size_t _Ip, class _Vp>
	inline _LIBCPP_INLINE_VISIBILITY
	static constexpr auto&& __get_alt(_Vp&& __v) {
	return __base::__get_alt<_Ip>(_VSTD::forward<_Vp>(__v).__impl);
	}
	};

	} // namespace __access

	namespace __visitation {

	// Visitation machinery operating on the base-class layer. Dispatch is done
	// through constexpr tables of function pointers built by the private
	// __make_* helpers below.
	struct __base {
	// Visits all variants at the same alternative index __index. Builds a
	// one-dimensional table (__fdiagonal) and calls the entry at __index with
	// the visitor and each variant converted via __as_base().
	template <class _Visitor, class... _Vs>
	inline _LIBCPP_INLINE_VISIBILITY
	static constexpr decltype(auto)
	__visit_alt_at(size_t __index, _Visitor&& __visitor, _Vs&&... __vs) {
	constexpr auto __fdiagonal =
	__make_fdiagonal<_Visitor&&,
	decltype(_VSTD::forward<_Vs>(__vs).__as_base())...>();
	return __fdiagonal[__index](_VSTD::forward<_Visitor>(__visitor),
	_VSTD::forward<_Vs>(__vs).__as_base()...);
	}

	// Visits each variant at its own current index. Builds a nested table
	// (__fmatrix) with one dimension per variant and selects the entry using
	// the array of index() values.
	template <class _Visitor, class... _Vs>
	inline _LIBCPP_INLINE_VISIBILITY
	static constexpr decltype(auto) __visit_alt(_Visitor&& __visitor,
	_Vs&&... __vs) {
	constexpr auto __fmatrix =
	__make_fmatrix<_Visitor&&,
	decltype(_VSTD::forward<_Vs>(__vs).__as_base())...>();
	const size_t __indices[] = {__vs.index()...};
	return __at(__fmatrix, __indices)(_VSTD::forward<_Visitor>(__visitor),
	_VSTD::forward<_Vs>(__vs).__as_base()...);
	}

	private:
	// Recursion end: the element is not an array, return it as is.
	template <class _Tp>
	inline _LIBCPP_INLINE_VISIBILITY
	static constexpr const _Tp& __at_impl(const _Tp& __elem, const size_t*) {
	return __elem;
	}

	// Recursion step: index the current array level with *__index, then
	// continue with the next index.
	template <class _Tp, size_t _Np>
	inline _LIBCPP_INLINE_VISIBILITY
	static constexpr auto&& __at_impl(const array<_Tp, _Np>& __elems,
	const size_t* __index) {
	return __at_impl(__elems[*__index], __index + 1);
	}

	// Looks up an entry in a nested array using an array of indices.
	template <class _Tp, size_t _Np, size_t _Ip>
	inline _LIBCPP_INLINE_VISIBILITY
	static constexpr auto&& __at(const array<_Tp, _Np>& __elems,
	const size_t (&__indices)[_Ip]) {
	return __at_impl(__elems, begin(__indices));
	}

	// Compile-time check that every type in _Fs is the same as _Fp.
	template <class _Fp, class... _Fs>
	static constexpr void __std_visit_visitor_return_type_check() {
	static_assert(
	__all<is_same_v<_Fp, _Fs>...>::value,
	"`std::visit` requires the visitor to have a single return type.");
	}

	// Packs the given entries into an array whose element type is the common
	// type of the decayed entry types, after checking they are all identical.
	template <class... _Fs>
	inline _LIBCPP_INLINE_VISIBILITY
	static constexpr auto __make_farray(_Fs&&... __fs) {
	__std_visit_visitor_return_type_check<decay_t<_Fs>...>();
	using __result = array<common_type_t<decay_t<_Fs>...>, sizeof...(_Fs)>;
	return __result{{_VSTD::forward<_Fs>(__fs)...}};
	}

	// Returns the address of a static function that invokes __f with the
	// _Is-th alternative of the corresponding variant in __vs. The casts back
	// to _Fp/_Vs restore the reference types the table was built for.
	template <class _Fp, class... _Vs, size_t... _Is>
	inline _LIBCPP_INLINE_VISIBILITY
	static constexpr auto __make_dispatch(index_sequence<_Is...>) {
	struct __dispatcher {
	static constexpr decltype(auto) __dispatch(_Fp __f, _Vs... __vs) {
	return __invoke_constexpr(
	static_cast<_Fp>(__f),
	__access::__base::__get_alt<_Is>(static_cast<_Vs>(__vs))...);
	}
	};
	return _VSTD::addressof(__dispatcher::__dispatch);
	}

	// Dispatcher for one diagonal entry: the index sequence repeats _Ip once
	// per variant (the comma expression discards the __identity object).
	template <size_t _Ip, class _Fp, class... _Vs>
	inline _LIBCPP_INLINE_VISIBILITY
	static constexpr auto __make_fdiagonal_impl() {
	return __make_dispatch<_Fp, _Vs...>(
	index_sequence<(__identity<_Vs>{}, _Ip)...>{});
	}

	// Array of diagonal dispatchers, one per index in _Is.
	template <class _Fp, class... _Vs, size_t... _Is>
	inline _LIBCPP_INLINE_VISIBILITY
	static constexpr auto __make_fdiagonal_impl(index_sequence<_Is...>) {
	return __base::__make_farray(__make_fdiagonal_impl<_Is, _Fp, _Vs...>()...);
	}

	// Builds the diagonal table; all variants must have the same number of
	// alternatives as the first one (enforced by the static_assert).
	template <class _Fp, class _Vp, class... _Vs>
	inline _LIBCPP_INLINE_VISIBILITY
	static constexpr auto __make_fdiagonal() {
	constexpr size_t _Np = decay_t<_Vp>::__size();
	static_assert(__all<(_Np == decay_t<_Vs>::__size())...>::value);
	return __make_fdiagonal_impl<_Fp, _Vp, _Vs...>(make_index_sequence<_Np>{});
	}

	// Matrix recursion end: all dimensions consumed, _Is holds one index per
	// variant, so produce the dispatcher for that combination.
	template <class _Fp, class... _Vs, size_t... _Is>
	inline _LIBCPP_INLINE_VISIBILITY
	static constexpr auto __make_fmatrix_impl(index_sequence<_Is...> __is) {
	return __make_dispatch<_Fp, _Vs...>(__is);
	}

	// Matrix recursion step: for each index in _Js append it to the
	// accumulated _Is and recurse over the remaining sequences __ls.
	template <class _Fp, class... _Vs, size_t... _Is, size_t... _Js, class... _Ls>
	inline _LIBCPP_INLINE_VISIBILITY
	static constexpr auto __make_fmatrix_impl(index_sequence<_Is...>,
	index_sequence<_Js...>,
	_Ls... __ls) {
	return __base::__make_farray(__make_fmatrix_impl<_Fp, _Vs...>(
	index_sequence<_Is..., _Js>{}, __ls...)...);
	}

	// Builds the full dispatch matrix: starts with an empty index prefix and
	// one index sequence per variant sized by that variant's __size().
	template <class _Fp, class... _Vs>
	inline _LIBCPP_INLINE_VISIBILITY
	static constexpr auto __make_fmatrix() {
	return __make_fmatrix_impl<_Fp, _Vs...>(
	index_sequence<>{}, make_index_sequence<decay_t<_Vs>::__size()>{}...);
	}
	};

	// Visitation entry points operating on variant objects. Each one unwraps
	// the __impl member of every variant and delegates to __visitation::__base.
	// The *_alt* functions hand the visitor the alternative wrappers; the
	// *_value* functions wrap the visitor so it receives the __value members.
	struct __variant {
	template <class _Visitor, class... _Vs>
	inline _LIBCPP_INLINE_VISIBILITY
	static constexpr decltype(auto)
	__visit_alt_at(size_t __index, _Visitor&& __visitor, _Vs&&... __vs) {
	return __base::__visit_alt_at(__index,
	_VSTD::forward<_Visitor>(__visitor),
	_VSTD::forward<_Vs>(__vs).__impl...);
	}

	template <class _Visitor, class... _Vs>
	inline _LIBCPP_INLINE_VISIBILITY
	static constexpr decltype(auto) __visit_alt(_Visitor&& __visitor,
	_Vs&&... __vs) {
	return __base::__visit_alt(_VSTD::forward<_Visitor>(__visitor),
	_VSTD::forward<_Vs>(__vs).__impl...);
	}

	template <class _Visitor, class... _Vs>
	inline _LIBCPP_INLINE_VISIBILITY
	static constexpr decltype(auto)
	__visit_value_at(size_t __index, _Visitor&& __visitor, _Vs&&... __vs) {
	return __visit_alt_at(
	__index,
	__make_value_visitor(_VSTD::forward<_Visitor>(__visitor)),
	_VSTD::forward<_Vs>(__vs)...);
	}

	template <class _Visitor, class... _Vs>
	inline _LIBCPP_INLINE_VISIBILITY
	static constexpr decltype(auto) __visit_value(_Visitor&& __visitor,
	_Vs&&... __vs) {
	return __visit_alt(
	__make_value_visitor(_VSTD::forward<_Visitor>(__visitor)),
	_VSTD::forward<_Vs>(__vs)...);
	}

	private:
	// Compile-time check that _Visitor can be called with _Values...
	template <class _Visitor, class... _Values>
	static constexpr void __std_visit_exhaustive_visitor_check() {
	static_assert(is_callable_v<_Visitor(_Values...)>,
	"`std::visit` requires the visitor to be exhaustive.");
	}

	// Adapter holding a reference to the user's visitor. When called with
	// alternative wrappers it first runs the exhaustiveness check and then
	// invokes the visitor with each wrapper's __value member.
	// NOTE(review): the two decltype lines below carry diff markers ('-' is
	// the old text, '+' the new); the '+' form parenthesizes the expression.
	template <class _Visitor>
	struct __value_visitor {
	template <class... _Alts>
	inline _LIBCPP_INLINE_VISIBILITY
	constexpr decltype(auto) operator()(_Alts&&... __alts) const {
	__std_visit_exhaustive_visitor_check<
	_Visitor,
	- decltype(_VSTD::forward<_Alts>(__alts).__value)...>();
	+ decltype((_VSTD::forward<_Alts>(__alts).__value))...>();
	return __invoke_constexpr(_VSTD::forward<_Visitor>(__visitor),
	_VSTD::forward<_Alts>(__alts).__value...);
	}
	_Visitor&& __visitor;
	};

	// Wraps __visitor in a __value_visitor.
	template <class _Visitor>
	inline _LIBCPP_INLINE_VISIBILITY
	static constexpr auto __make_value_visitor(_Visitor&& __visitor) {
	return __value_visitor<_Visitor>{_VSTD::forward<_Visitor>(__visitor)};
	}
	};

	} // namespace __visitation

	// Wrapper for a single alternative: stores a _Tp as __value and carries its
	// position in the variant as the _Index template parameter.
	template <size_t _Index, class _Tp>
	struct _LIBCPP_TEMPLATE_VIS __alt {
	using __value_type = _Tp;

	// Constructs __value directly from the forwarded arguments.
	template <class... _Args>
	inline _LIBCPP_INLINE_VISIBILITY
	explicit constexpr __alt(in_place_t, _Args&&... __args)
	: __value(_VSTD::forward<_Args>(__args)...) {}

	__value_type __value;
	};

	// Recursive union storage for the alternatives. The non-empty
	// specializations are generated by _LIBCPP_VARIANT_UNION below.
	template <_Trait _DestructibleTrait, size_t _Index, class... _Types>
	union _LIBCPP_TEMPLATE_VIS __union;

	// Recursion end: no alternatives left, empty union.
	template <_Trait _DestructibleTrait, size_t _Index>
	union _LIBCPP_TEMPLATE_VIS __union<_DestructibleTrait, _Index> {};

	// Generates the __union specialization for one destructibility trait.
	// Members: __dummy (initialized by the valueless constructor), __head (the
	// alternative at _Index) and __tail (the union of the remaining types at
	// _Index + 1). The in_place_index_t<0> constructor builds __head; any other
	// index is forwarded to __tail with the index decremented. The `destructor`
	// argument supplies the destructor declaration. Comments are kept outside
	// the macro body because of the line continuations.
	#define _LIBCPP_VARIANT_UNION(destructible_trait, destructor) \
	template <size_t _Index, class _Tp, class... _Types> \
	union _LIBCPP_TEMPLATE_VIS __union<destructible_trait, \
	_Index, \
	_Tp, \
	_Types...> { \
	public: \
	inline _LIBCPP_INLINE_VISIBILITY \
	explicit constexpr __union(__valueless_t) noexcept : __dummy{} {} \
	\
	template <class... _Args> \
	inline _LIBCPP_INLINE_VISIBILITY \
	explicit constexpr __union(in_place_index_t<0>, _Args&&... __args) \
	: __head(in_place, _VSTD::forward<_Args>(__args)...) {} \
	\
	template <size_t _Ip, class... _Args> \
	inline _LIBCPP_INLINE_VISIBILITY \
	explicit constexpr __union(in_place_index_t<_Ip>, _Args&&... __args) \
	: __tail(in_place_index<_Ip - 1>, _VSTD::forward<_Args>(__args)...) {} \
	\
	__union(const __union&) = default; \
	__union(__union&&) = default; \
	\
	destructor \
	\
	__union& operator=(const __union&) = default; \
	__union& operator=(__union&&) = default; \
	\
	private: \
	char __dummy; \
	__alt<_Index, _Tp> __head; \
	__union<destructible_trait, _Index + 1, _Types...> __tail; \
	\
	friend struct __access::__union; \
	}

	// Trivially destructible alternatives: defaulted destructor.
	_LIBCPP_VARIANT_UNION(_Trait::_TriviallyAvailable, ~__union() = default;);
	// Non-trivially destructible alternatives: the union destructor is empty;
	// the active alternative is destroyed by __destructor::__destroy().
	_LIBCPP_VARIANT_UNION(_Trait::_Available, ~__union() {});
	// Some alternative is not destructible: destructor deleted.
	_LIBCPP_VARIANT_UNION(_Trait::_Unavailable, ~__union() = delete;);

	#undef _LIBCPP_VARIANT_UNION

	// Lowest layer of the variant implementation: owns the storage union
	// (__data) and the index of the active alternative (__index).
	template <_Trait _DestructibleTrait, class... _Types>
	class _LIBCPP_TEMPLATE_VIS __base {
	public:
	// Creates a valueless object: the index is set to __variant_npos.
	inline _LIBCPP_INLINE_VISIBILITY
	explicit constexpr __base(__valueless_t tag) noexcept
	: __data(tag), __index(__variant_npos) {}

	// Constructs alternative _Ip in place from the forwarded arguments and
	// records _Ip as the active index.
	template <size_t _Ip, class... _Args>
	inline _LIBCPP_INLINE_VISIBILITY
	explicit constexpr __base(in_place_index_t<_Ip>, _Args&&... __args)
	:
	__data(in_place_index<_Ip>, _VSTD::forward<_Args>(__args)...),
	__index(_Ip) {}

	inline _LIBCPP_INLINE_VISIBILITY
	constexpr bool valueless_by_exception() const noexcept {
	return index() == variant_npos;
	}

	// Translates the internal unsigned int sentinel to the public size_t
	// variant_npos; any other stored index is returned unchanged.
	inline _LIBCPP_INLINE_VISIBILITY
	constexpr size_t index() const noexcept {
	return __index == __variant_npos ? variant_npos : __index;
	}

	protected:
	// Return *this viewed as __base, keeping constness and value category;
	// used by __visitation::__base.
	inline _LIBCPP_INLINE_VISIBILITY
	constexpr auto&& __as_base() & { return *this; }

	inline _LIBCPP_INLINE_VISIBILITY
	constexpr auto&& __as_base() && { return _VSTD::move(*this); }

	inline _LIBCPP_INLINE_VISIBILITY
	constexpr auto&& __as_base() const & { return *this; }

	inline _LIBCPP_INLINE_VISIBILITY
	constexpr auto&& __as_base() const && { return _VSTD::move(*this); }

	// Number of alternatives.
	inline _LIBCPP_INLINE_VISIBILITY
	static constexpr size_t __size() { return sizeof...(_Types); }

	__union<_DestructibleTrait, 0, _Types...> __data;
	unsigned int __index;

	friend struct __access::__base;
	friend struct __visitation::__base;
	};

	// Layer adding the destructor and the protected __destroy() helper,
	// specialized on the variant's destructible trait.
	template <class _Traits, _Trait = _Traits::__destructible_trait>
	class _LIBCPP_TEMPLATE_VIS __destructor;

	// Generates one __destructor specialization. `destructor` supplies the
	// destructor declaration and `destroy` the __destroy() member; all other
	// special members are defaulted and the base constructors are inherited.
	// Comments are kept outside the macro body because of the line
	// continuations.
	#define _LIBCPP_VARIANT_DESTRUCTOR(destructible_trait, destructor, destroy) \
	template <class... _Types> \
	class _LIBCPP_TEMPLATE_VIS __destructor<__traits<_Types...>, \
	destructible_trait> \
	: public __base<destructible_trait, _Types...> { \
	using __base_type = __base<destructible_trait, _Types...>; \
	\
	public: \
	using __base_type::__base_type; \
	using __base_type::operator=; \
	\
	__destructor(const __destructor&) = default; \
	__destructor(__destructor&&) = default; \
	destructor \
	__destructor& operator=(const __destructor&) = default; \
	__destructor& operator=(__destructor&&) = default; \
	\
	protected: \
	inline _LIBCPP_INLINE_VISIBILITY \
	destroy \
	}

	// Trivially destructible: __destroy() only marks the object valueless.
	_LIBCPP_VARIANT_DESTRUCTOR(
	_Trait::_TriviallyAvailable,
	~__destructor() = default;,
	void __destroy() noexcept { this->__index = __variant_npos; });

	// Non-trivially destructible: unless already valueless, __destroy() visits
	// the active alternative and calls its destructor explicitly, then marks
	// the object valueless. The class destructor calls __destroy().
	_LIBCPP_VARIANT_DESTRUCTOR(
	_Trait::_Available,
	~__destructor() { __destroy(); },
	void __destroy() noexcept {
	if (!this->valueless_by_exception()) {
	__visitation::__base::__visit_alt(
	[](auto& __alt) noexcept {
	using __alt_type = decay_t<decltype(__alt)>;
	__alt.~__alt_type();
	},
	*this);
	}
	this->__index = __variant_npos;
	});

	// Not destructible: both the destructor and __destroy() are deleted.
	_LIBCPP_VARIANT_DESTRUCTOR(
	_Trait::_Unavailable,
	~__destructor() = delete;,
	void __destroy() noexcept = delete;);

	#undef _LIBCPP_VARIANT_DESTRUCTOR

	// Layer providing the construction helpers shared by the copy/move
	// constructor and assignment layers.
	template <class _Traits>
	class _LIBCPP_TEMPLATE_VIS __constructor : public __destructor<_Traits> {
	using __base_type = __destructor<_Traits>;

	public:
	using __base_type::__base_type;
	using __base_type::operator=;

	protected:
	// Placement-constructs an alternative wrapper at the address of __a from
	// the forwarded arguments. Does not touch the index.
	template <size_t _Ip, class _Tp, class... _Args>
	inline _LIBCPP_INLINE_VISIBILITY
	static void __construct_alt(__alt<_Ip, _Tp>& __a, _Args&&... __args) {
	::new (_VSTD::addressof(__a))
	__alt<_Ip, _Tp>(in_place, _VSTD::forward<_Args>(__args)...);
	}

	// Makes __lhs hold a copy/move of __rhs's active alternative: __lhs is
	// destroyed first; if __rhs has a value, the alternative at __rhs.index()
	// is constructed in __lhs from __rhs's value and only afterwards is
	// __lhs.__index updated. If __rhs is valueless, __lhs stays valueless.
	template <class _Rhs>
	inline _LIBCPP_INLINE_VISIBILITY
	static void __generic_construct(__constructor& __lhs, _Rhs&& __rhs) {
	__lhs.__destroy();
	if (!__rhs.valueless_by_exception()) {
	__visitation::__base::__visit_alt_at(
	__rhs.index(),
	[](auto& __lhs_alt, auto&& __rhs_alt) {
	__construct_alt(
	__lhs_alt,
	_VSTD::forward<decltype(__rhs_alt)>(__rhs_alt).__value);
	},
	__lhs, _VSTD::forward<_Rhs>(__rhs));
	__lhs.__index = __rhs.index();
	}
	}
	};

	// Layer supplying the move constructor, specialized on the variant's
	// move-constructible trait.
	template <class _Traits, _Trait = _Traits::__move_constructible_trait>
	class _LIBCPP_TEMPLATE_VIS __move_constructor;

	// Generates one __move_constructor specialization; `move_constructor`
	// supplies the move constructor, every other special member is defaulted.
	// Comments are kept outside the macro body because of the line
	// continuations.
	#define _LIBCPP_VARIANT_MOVE_CONSTRUCTOR(move_constructible_trait, \
	move_constructor) \
	template <class... _Types> \
	class _LIBCPP_TEMPLATE_VIS __move_constructor<__traits<_Types...>, \
	move_constructible_trait> \
	: public __constructor<__traits<_Types...>> { \
	using __base_type = __constructor<__traits<_Types...>>; \
	\
	public: \
	using __base_type::__base_type; \
	using __base_type::operator=; \
	\
	__move_constructor(const __move_constructor&) = default; \
	move_constructor \
	~__move_constructor() = default; \
	__move_constructor& operator=(const __move_constructor&) = default; \
	__move_constructor& operator=(__move_constructor&&) = default; \
	}

	// All alternatives trivially move constructible: defaulted.
	_LIBCPP_VARIANT_MOVE_CONSTRUCTOR(
	_Trait::_TriviallyAvailable,
	__move_constructor(__move_constructor&& __that) = default;);

	// Otherwise start valueless and move-construct the active alternative of
	// __that; noexcept only if every alternative is nothrow move constructible.
	_LIBCPP_VARIANT_MOVE_CONSTRUCTOR(
	_Trait::_Available,
	__move_constructor(__move_constructor&& __that) noexcept(
	__all<is_nothrow_move_constructible_v<_Types>...>::value)
	: __move_constructor(__valueless_t{}) {
	this->__generic_construct(*this, _VSTD::move(__that));
	});

	// Some alternative is not move constructible: deleted.
	_LIBCPP_VARIANT_MOVE_CONSTRUCTOR(
	_Trait::_Unavailable,
	__move_constructor(__move_constructor&&) = delete;);

	#undef _LIBCPP_VARIANT_MOVE_CONSTRUCTOR

	// Layer supplying the copy constructor, specialized on the variant's
	// copy-constructible trait.
	template <class _Traits, _Trait = _Traits::__copy_constructible_trait>
	class _LIBCPP_TEMPLATE_VIS __copy_constructor;

	// Generates one __copy_constructor specialization; `copy_constructor`
	// supplies the copy constructor, every other special member is defaulted.
	// Comments are kept outside the macro body because of the line
	// continuations.
	#define _LIBCPP_VARIANT_COPY_CONSTRUCTOR(copy_constructible_trait, \
	copy_constructor) \
	template <class... _Types> \
	class _LIBCPP_TEMPLATE_VIS __copy_constructor<__traits<_Types...>, \
	copy_constructible_trait> \
	: public __move_constructor<__traits<_Types...>> { \
	using __base_type = __move_constructor<__traits<_Types...>>; \
	\
	public: \
	using __base_type::__base_type; \
	using __base_type::operator=; \
	\
	copy_constructor \
	__copy_constructor(__copy_constructor&&) = default; \
	~__copy_constructor() = default; \
	__copy_constructor& operator=(const __copy_constructor&) = default; \
	__copy_constructor& operator=(__copy_constructor&&) = default; \
	}

	// All alternatives trivially copy constructible: defaulted.
	_LIBCPP_VARIANT_COPY_CONSTRUCTOR(
	_Trait::_TriviallyAvailable,
	__copy_constructor(const __copy_constructor& __that) = default;);

	// Otherwise start valueless and copy-construct the active alternative of
	// __that.
	_LIBCPP_VARIANT_COPY_CONSTRUCTOR(
	_Trait::_Available,
	__copy_constructor(const __copy_constructor& __that)
	: __copy_constructor(__valueless_t{}) {
	this->__generic_construct(*this, __that);
	});

	// Some alternative is not copy constructible: deleted.
	_LIBCPP_VARIANT_COPY_CONSTRUCTOR(
	_Trait::_Unavailable,
	__copy_constructor(const __copy_constructor&) = delete;);

	#undef _LIBCPP_VARIANT_COPY_CONSTRUCTOR

	// Layer providing emplace and the assignment helpers shared by the
	// copy/move assignment layers.
	template <class _Traits>
	class _LIBCPP_TEMPLATE_VIS __assignment : public __copy_constructor<_Traits> {
	using __base_type = __copy_constructor<_Traits>;

	public:
	using __base_type::__base_type;
	using __base_type::operator=;

	// Destroys the current alternative, constructs alternative _Ip from the
	// forwarded arguments, and only then records _Ip as the active index. If
	// construction throws, the index set by __destroy() (__variant_npos) is
	// left in place.
	template <size_t _Ip, class... _Args>
	inline _LIBCPP_INLINE_VISIBILITY
	void __emplace(_Args&&... __args) {
	this->__destroy();
	this->__construct_alt(__access::__base::__get_alt<_Ip>(*this),
	_VSTD::forward<_Args>(__args)...);
	this->__index = _Ip;
	}

	protected:
	// Assigns __arg to alternative _Ip (referenced by __a). If _Ip is already
	// active, the contained value is assigned directly. Otherwise the variant
	// is re-emplaced; the tag selects how: true_type first builds a temporary
	// _Tp from __arg and emplaces from that temporary, false_type emplaces
	// from __arg directly.
	template <bool _CopyAssign, size_t _Ip, class _Tp, class _Arg>
	inline _LIBCPP_INLINE_VISIBILITY
	void __assign_alt(__alt<_Ip, _Tp>& __a,
	_Arg&& __arg,
	bool_constant<_CopyAssign> __tag) {
	if (this->index() == _Ip) {
	__a.__value = _VSTD::forward<_Arg>(__arg);
	} else {
	struct {
	void operator()(true_type) const {
	__this->__emplace<_Ip>(_Tp(_VSTD::forward<_Arg>(__arg)));
	}
	void operator()(false_type) const {
	__this->__emplace<_Ip>(_VSTD::forward<_Arg>(__arg));
	}
	__assignment* __this;
	_Arg&& __arg;
	} __impl{this, _VSTD::forward<_Arg>(__arg)};
	__impl(__tag);
	}
	}

	// Assigns from another variant implementation object. Both valueless:
	// nothing to do. Only __that valueless: destroy this object's value.
	// Otherwise dispatch on __that.index() and assign the matching alternative
	// via __assign_alt; the tag is true_type when _That is an lvalue
	// reference.
	template <class _That>
	inline _LIBCPP_INLINE_VISIBILITY
	void __generic_assign(_That&& __that) {
	if (this->valueless_by_exception() && __that.valueless_by_exception()) {
	// do nothing.
	} else if (__that.valueless_by_exception()) {
	this->__destroy();
	} else {
	__visitation::__base::__visit_alt_at(
	__that.index(),
	[this](auto& __this_alt, auto&& __that_alt) {
	this->__assign_alt(
	__this_alt,
	_VSTD::forward<decltype(__that_alt)>(__that_alt).__value,
	is_lvalue_reference<_That>{});
	},
	*this, _VSTD::forward<_That>(__that));
	}
	}
	};

	// Layer supplying the move assignment operator, specialized on the
	// variant's move-assignable trait.
	template <class _Traits, _Trait = _Traits::__move_assignable_trait>
	class _LIBCPP_TEMPLATE_VIS __move_assignment;

	// Generates one __move_assignment specialization; `move_assignment`
	// supplies the move assignment operator, every other special member is
	// defaulted. Comments are kept outside the macro body because of the line
	// continuations.
	#define _LIBCPP_VARIANT_MOVE_ASSIGNMENT(move_assignable_trait, \
	move_assignment) \
	template <class... _Types> \
	class _LIBCPP_TEMPLATE_VIS __move_assignment<__traits<_Types...>, \
	move_assignable_trait> \
	: public __assignment<__traits<_Types...>> { \
	using __base_type = __assignment<__traits<_Types...>>; \
	\
	public: \
	using __base_type::__base_type; \
	using __base_type::operator=; \
	\
	__move_assignment(const __move_assignment&) = default; \
	__move_assignment(__move_assignment&&) = default; \
	~__move_assignment() = default; \
	__move_assignment& operator=(const __move_assignment&) = default; \
	move_assignment \
	}

	// Trivially move assignable: defaulted.
	_LIBCPP_VARIANT_MOVE_ASSIGNMENT(
	_Trait::_TriviallyAvailable,
	__move_assignment& operator=(__move_assignment&& __that) = default;);

	// Otherwise delegate to __generic_assign with an rvalue; noexcept only if
	// every alternative is nothrow move constructible and move assignable.
	_LIBCPP_VARIANT_MOVE_ASSIGNMENT(
	_Trait::_Available,
	__move_assignment& operator=(__move_assignment&& __that) noexcept(
	__all<(is_nothrow_move_constructible_v<_Types> &&
	is_nothrow_move_assignable_v<_Types>)...>::value) {
	this->__generic_assign(_VSTD::move(__that));
	return *this;
	});

	// Not move assignable: deleted.
	_LIBCPP_VARIANT_MOVE_ASSIGNMENT(
	_Trait::_Unavailable,
	__move_assignment& operator=(__move_assignment&&) = delete;);

	#undef _LIBCPP_VARIANT_MOVE_ASSIGNMENT

	// Layer that supplies the copy-assignment operator. The second template
	// parameter (defaulted from `_Traits::__copy_assignable_trait`) selects one
	// of the three partial specializations stamped out by the macro below.
	template <class _Traits, _Trait = _Traits::__copy_assignable_trait>
	class _LIBCPP_TEMPLATE_VIS __copy_assignment;

	// Defines the partial specialization for `copy_assignable_trait`. Each
	// specialization derives from `__move_assignment`, inherits its constructors
	// and assignment operators, defaults the copy/move constructors, destructor
	// and move assignment, and pastes in `copy_assignment` as the declaration of
	// the copy-assignment operator.
	#define _LIBCPP_VARIANT_COPY_ASSIGNMENT(copy_assignable_trait, \
	copy_assignment) \
	template <class... _Types> \
	class _LIBCPP_TEMPLATE_VIS __copy_assignment<__traits<_Types...>, \
	copy_assignable_trait> \
	: public __move_assignment<__traits<_Types...>> { \
	using __base_type = __move_assignment<__traits<_Types...>>; \
	\
	public: \
	using __base_type::__base_type; \
	using __base_type::operator=; \
	\
	__copy_assignment(const __copy_assignment&) = default; \
	__copy_assignment(__copy_assignment&&) = default; \
	~__copy_assignment() = default; \
	copy_assignment \
	__copy_assignment& operator=(__copy_assignment&&) = default; \
	}

	// _TriviallyAvailable: the copy-assignment operator is defaulted.
	_LIBCPP_VARIANT_COPY_ASSIGNMENT(
	_Trait::_TriviallyAvailable,
	__copy_assignment& operator=(const __copy_assignment& __that) = default;);

	// _Available: user-provided copy assignment that forwards the source as an
	// lvalue to `__generic_assign`.
	_LIBCPP_VARIANT_COPY_ASSIGNMENT(
	_Trait::_Available,
	__copy_assignment& operator=(const __copy_assignment& __that) {
	this->__generic_assign(__that);
	return *this;
	});

	// _Unavailable: copy assignment is deleted.
	_LIBCPP_VARIANT_COPY_ASSIGNMENT(
	_Trait::_Unavailable,
	__copy_assignment& operator=(const __copy_assignment&) = delete;);

	#undef _LIBCPP_VARIANT_COPY_ASSIGNMENT

	// Final layer of the storage hierarchy held by `variant`: adds converting
	// assignment (`__assign`) and `__swap` on top of `__copy_assignment`.
	template <class... _Types>
	class _LIBCPP_TEMPLATE_VIS __impl
	    : public __copy_assignment<__traits<_Types...>> {
	  using __base_type = __copy_assignment<__traits<_Types...>>;

	public:
	  using __base_type::__base_type;
	  using __base_type::operator=;

	  // Assigns `__arg` to alternative `_Ip`. The `false_type` tag selects the
	  // branch of `__assign_alt` that emplaces directly from `__arg` when a
	  // different alternative is currently active.
	  template <size_t _Ip, class _Arg>
	  inline _LIBCPP_INLINE_VISIBILITY
	  void __assign(_Arg&& __arg) {
	    this->__assign_alt(__access::__base::__get_alt<_Ip>(*this),
	                       _VSTD::forward<_Arg>(__arg),
	                       false_type{});
	  }

	  // Exchanges the contents of `*this` and `__that`.
	  inline _LIBCPP_INLINE_VISIBILITY
	  void __swap(__impl& __that) {
	    if (this->valueless_by_exception() && __that.valueless_by_exception()) {
	      // do nothing.
	    } else if (this->index() == __that.index()) {
	      // Same alternative on both sides: swap the held values, found via ADL
	      // with `_VSTD::swap` as the fallback.
	      __visitation::__base::__visit_alt_at(
	          this->index(),
	          [](auto& __this_alt, auto& __that_alt) {
	            using _VSTD::swap;
	            swap(__this_alt.__value, __that_alt.__value);
	          },
	          *this,
	          __that);
	    } else {
	      // Different alternatives: move one side into a temporary, move the
	      // other side across, then move the temporary into the vacated side.
	      __impl* __lhs = this;
	      __impl* __rhs = _VSTD::addressof(__that);
	      // When exactly one side is nothrow movable, make that side `__rhs`
	      // so that the first move (into `__tmp`) is the one that cannot throw.
	      if (__lhs->__move_nothrow() && !__rhs->__move_nothrow()) {
	        _VSTD::swap(__lhs, __rhs);
	      }
	      __impl __tmp(_VSTD::move(*__rhs));
	#ifndef _LIBCPP_NO_EXCEPTIONS
	      // EXTENSION: When the move construction of `__lhs` into `__rhs` throws
	      // and `__tmp` is nothrow move constructible then we move `__tmp` back
	      // into `__rhs` and provide the strong exception safety guarantee.
	      try {
	        // `__lhs`/`__rhs` are pointers; the objects themselves must be passed,
	        // matching the other `__generic_construct` calls in this function.
	        this->__generic_construct(*__rhs, _VSTD::move(*__lhs));
	      } catch (...) {
	        if (__tmp.__move_nothrow()) {
	          this->__generic_construct(*__rhs, _VSTD::move(__tmp));
	        }
	        throw;
	      }
	#else
	      this->__generic_construct(*__rhs, _VSTD::move(*__lhs));
	#endif
	      this->__generic_construct(*__lhs, _VSTD::move(__tmp));
	    }
	  }

	private:
	  // True when this object is valueless or when the currently active
	  // alternative is nothrow move constructible.
	  inline _LIBCPP_INLINE_VISIBILITY
	  bool __move_nothrow() const {
	    // One entry per alternative, indexed by `index()`.
	    constexpr bool __results[] = {is_nothrow_move_constructible_v<_Types>...};
	    return this->valueless_by_exception() || __results[this->index()];
	  }
	};

	// Overload set with one `operator()(_Tp)` per type in `_Types...`, used to
	// let ordinary overload resolution pick the alternative an argument converts
	// to best. The operators are declared but never defined; only their types
	// are inspected.
	template <class... _Types>
	struct __overload;

	// Recursion terminator. It declares a nullary `operator()` so that the
	// using-declaration in the recursive case always has something to name.
	template <>
	struct __overload<> { void operator()() const; };

	// Recursive case: re-exports the operators for the remaining types and adds
	// one taking `_Tp` by value, returning `__identity<_Tp>` as a type tag.
	template <class _Tp, class... _Types>
	struct __overload<_Tp, _Types...> : __overload<_Types...> {
	using __overload<_Types...>::operator();
	__identity<_Tp> operator()(_Tp) const;
	};

	// The alternative type selected when calling `__overload<_Types...>` with an
	// argument of type `_Tp&&`: the nested `type` of the chosen operator's result.
	template <class _Tp, class... _Types>
	using __best_match_t = typename result_of_t<__overload<_Types...>(_Tp&&)>::type;

	} // __variant_detail

	// The public `variant` class template. All state lives in the single
	// `__variant_detail::__impl<_Types...>` member; every public member below
	// forwards to it. The two private `__sfinae_*_base` bases are parameterized
	// on whether all alternatives are copy/move constructible/assignable
	// (presumably to enable or disable the defaulted special members; confirm
	// against their definitions).
	template <class... _Types>
	class _LIBCPP_TEMPLATE_VIS variant
	: private __sfinae_ctor_base<
	__all<is_copy_constructible_v<_Types>...>::value,
	__all<is_move_constructible_v<_Types>...>::value>,
	private __sfinae_assign_base<
	__all<(is_copy_constructible_v<_Types> &&
	is_move_constructible_v<_Types> &&
	is_copy_assignable_v<_Types>)...>::value,
	__all<(is_move_constructible_v<_Types> &&
	is_move_assignable_v<_Types>)...>::value> {
	// Compile-time restrictions on the alternative list.
	static_assert(0 < sizeof...(_Types),
	"variant must consist of at least one alternative.");

	static_assert(__all<!is_array_v<_Types>...>::value,
	"variant can not have an array type as an alternative.");

	static_assert(__all<!is_reference_v<_Types>...>::value,
	"variant can not have a reference type as an alternative.");

	static_assert(__all<!is_void_v<_Types>...>::value,
	"variant can not have a void type as an alternative.");

	using __first_type = variant_alternative_t<0, variant>;

	public:
	// Default constructor: constructs alternative 0. Participates in overload
	// resolution only when the first alternative is default constructible;
	// `_Dummy` makes the condition dependent so it is checked lazily.
	template <bool _Dummy = true,
	enable_if_t<__dependent_type<is_default_constructible<__first_type>,
	_Dummy>::value,
	int> = 0>
	inline _LIBCPP_INLINE_VISIBILITY
	constexpr variant() noexcept(is_nothrow_default_constructible_v<__first_type>)
	: __impl(in_place_index<0>) {}

	variant(const variant&) = default;
	variant(variant&&) = default;

	// Converting constructor. Disabled when `_Arg` decays to `variant` itself.
	// `_Tp` is the alternative chosen by overload resolution
	// (`__best_match_t`), `_Ip` its unambiguous index, and `_Tp` must be
	// constructible from `_Arg`.
	template <
	class _Arg,
	enable_if_t<!is_same_v<decay_t<_Arg>, variant>, int> = 0,
	class _Tp = __variant_detail::__best_match_t<_Arg, _Types...>,
	size_t _Ip =
	__find_detail::__find_unambiguous_index_sfinae<_Tp, _Types...>::value,
	enable_if_t<is_constructible_v<_Tp, _Arg>, int> = 0>
	inline _LIBCPP_INLINE_VISIBILITY
	constexpr variant(_Arg&& __arg) noexcept(
	is_nothrow_constructible_v<_Tp, _Arg>)
	: __impl(in_place_index<_Ip>, _VSTD::forward<_Arg>(__arg)) {}

	// In-place construction of alternative `_Ip` from `__args...`.
	template <size_t _Ip, class... _Args,
	enable_if_t<(_Ip < sizeof...(_Types)), int> = 0,
	class _Tp = variant_alternative_t<_Ip, variant<_Types...>>,
	enable_if_t<is_constructible_v<_Tp, _Args...>, int> = 0>
	inline _LIBCPP_INLINE_VISIBILITY
	explicit constexpr variant(
	in_place_index_t<_Ip>,
	_Args&&... __args) noexcept(is_nothrow_constructible_v<_Tp, _Args...>)
	: __impl(in_place_index<_Ip>, _VSTD::forward<_Args>(__args)...) {}

	// In-place construction of alternative `_Ip` from an initializer list plus
	// `__args...`.
	template <
	size_t _Ip,
	class _Up,
	class... _Args,
	enable_if_t<(_Ip < sizeof...(_Types)), int> = 0,
	class _Tp = variant_alternative_t<_Ip, variant<_Types...>>,
	enable_if_t<is_constructible_v<_Tp, initializer_list<_Up>&, _Args...>,
	int> = 0>
	inline _LIBCPP_INLINE_VISIBILITY
	explicit constexpr variant(
	in_place_index_t<_Ip>,
	initializer_list<_Up> __il,
	_Args&&... __args) noexcept(
	is_nothrow_constructible_v<_Tp, initializer_list<_Up>&, _Args...>)
	: __impl(in_place_index<_Ip>, __il, _VSTD::forward<_Args>(__args)...) {}

	// In-place construction selected by type: `_Tp` is mapped to its
	// unambiguous index `_Ip` and that alternative is constructed.
	template <
	class _Tp,
	class... _Args,
	size_t _Ip =
	__find_detail::__find_unambiguous_index_sfinae<_Tp, _Types...>::value,
	enable_if_t<is_constructible_v<_Tp, _Args...>, int> = 0>
	inline _LIBCPP_INLINE_VISIBILITY
	explicit constexpr variant(in_place_type_t<_Tp>, _Args&&... __args) noexcept(
	is_nothrow_constructible_v<_Tp, _Args...>)
	: __impl(in_place_index<_Ip>, _VSTD::forward<_Args>(__args)...) {}

	// As above, with a leading initializer list.
	template <
	class _Tp,
	class _Up,
	class... _Args,
	size_t _Ip =
	__find_detail::__find_unambiguous_index_sfinae<_Tp, _Types...>::value,
	enable_if_t<is_constructible_v<_Tp, initializer_list<_Up>&, _Args...>,
	int> = 0>
	inline _LIBCPP_INLINE_VISIBILITY
	explicit constexpr variant(
	in_place_type_t<_Tp>,
	initializer_list<_Up> __il,
	_Args&&... __args) noexcept(
	is_nothrow_constructible_v<_Tp, initializer_list< _Up>&, _Args...>)
	: __impl(in_place_index<_Ip>, __il, _VSTD::forward<_Args>(__args)...) {}

	~variant() = default;

	variant& operator=(const variant&) = default;
	variant& operator=(variant&&) = default;

	// Converting assignment. Selection of `_Tp`/`_Ip` mirrors the converting
	// constructor; additionally `_Tp&` must be assignable from `_Arg`. The work
	// is done by `__impl.__assign<_Ip>`.
	template <
	class _Arg,
	enable_if_t<!is_same_v<decay_t<_Arg>, variant>, int> = 0,
	class _Tp = __variant_detail::__best_match_t<_Arg, _Types...>,
	size_t _Ip =
	__find_detail::__find_unambiguous_index_sfinae<_Tp, _Types...>::value,
	enable_if_t<is_assignable_v<_Tp&, _Arg> && is_constructible_v<_Tp, _Arg>,
	int> = 0>
	inline _LIBCPP_INLINE_VISIBILITY
	variant& operator=(_Arg&& __arg) noexcept(
	is_nothrow_assignable_v<_Tp&, _Arg> &&
	is_nothrow_constructible_v<_Tp, _Arg>) {
	__impl.template __assign<_Ip>(_VSTD::forward<_Arg>(__arg));
	return *this;
	}

	// emplace overloads: replace the held value with alternative `_Ip`
	// (given directly, or derived from `_Tp`) constructed from the arguments.
	// All four forward to `__impl.__emplace<_Ip>` and return nothing.
	template <
	size_t _Ip,
	class... _Args,
	enable_if_t<(_Ip < sizeof...(_Types)), int> = 0,
	class _Tp = variant_alternative_t<_Ip, variant<_Types...>>,
	enable_if_t<is_constructible_v<_Tp, _Args...>, int> = 0>
	inline _LIBCPP_INLINE_VISIBILITY
	void emplace(_Args&&... __args) {
	__impl.template __emplace<_Ip>(_VSTD::forward<_Args>(__args)...);
	}

	template <
	size_t _Ip,
	class _Up,
	class... _Args,
	enable_if_t<(_Ip < sizeof...(_Types)), int> = 0,
	class _Tp = variant_alternative_t<_Ip, variant<_Types...>>,
	enable_if_t<is_constructible_v<_Tp, initializer_list<_Up>&, _Args...>,
	int> = 0>
	inline _LIBCPP_INLINE_VISIBILITY
	void emplace(initializer_list<_Up> __il, _Args&&... __args) {
	__impl.template __emplace<_Ip>(__il, _VSTD::forward<_Args>(__args)...);
	}

	template <
	class _Tp,
	class... _Args,
	size_t _Ip =
	__find_detail::__find_unambiguous_index_sfinae<_Tp, _Types...>::value,
	enable_if_t<is_constructible_v<_Tp, _Args...>, int> = 0>
	inline _LIBCPP_INLINE_VISIBILITY
	void emplace(_Args&&... __args) {
	__impl.template __emplace<_Ip>(_VSTD::forward<_Args>(__args)...);
	}

	template <
	class _Tp,
	class _Up,
	class... _Args,
	size_t _Ip =
	__find_detail::__find_unambiguous_index_sfinae<_Tp, _Types...>::value,
	enable_if_t<is_constructible_v<_Tp, initializer_list<_Up>&, _Args...>,
	int> = 0>
	inline _LIBCPP_INLINE_VISIBILITY
	void emplace(initializer_list<_Up> __il, _Args&&... __args) {
	__impl.template __emplace<_Ip>(__il, _VSTD::forward<_Args>(__args)...);
	}

	// Observers; both forward to the implementation object.
	inline _LIBCPP_INLINE_VISIBILITY
	constexpr bool valueless_by_exception() const noexcept {
	return __impl.valueless_by_exception();
	}

	inline _LIBCPP_INLINE_VISIBILITY
	constexpr size_t index() const noexcept { return __impl.index(); }

	// Member swap. Enabled only when every alternative is move constructible
	// and swappable (checked lazily through `_Dummy`); noexcept when every
	// alternative is nothrow move constructible and nothrow swappable.
	template <
	bool _Dummy = true,
	enable_if_t<
	__all<(
	__dependent_type<is_move_constructible<_Types>, _Dummy>::value &&
	__dependent_type<is_swappable<_Types>, _Dummy>::value)...>::value,
	int> = 0>
	inline _LIBCPP_INLINE_VISIBILITY
	void swap(variant& __that) noexcept(
	__all<(is_nothrow_move_constructible_v<_Types> &&
	is_nothrow_swappable_v<_Types>)...>::value) {
	__impl.__swap(__that.__impl);
	}

	private:
	// The storage/implementation object. Note that the member shares its name
	// with the `__variant_detail::__impl` class template.
	__variant_detail::__impl<_Types...> __impl;

	// Access and visitation helpers are granted access to `__impl`.
	friend struct __variant_detail::__access::__variant;
	friend struct __variant_detail::__visitation::__variant;
	};

	// Index-based test: true when alternative `_Ip` is the active one in `__v`.
	template <size_t _Ip, class... _Types>
	inline _LIBCPP_INLINE_VISIBILITY
	constexpr bool __holds_alternative(const variant<_Types...>& __v) noexcept {
	  const size_t __active = __v.index();
	  return __active == _Ip;
	}

	// Type-based test: maps `_Tp` to its (unique) position in `_Types...` and
	// checks whether that alternative is the active one in `__v`.
	template <class _Tp, class... _Types>
	inline _LIBCPP_INLINE_VISIBILITY
	constexpr bool holds_alternative(const variant<_Types...>& __v) noexcept {
	  constexpr size_t __ip = __find_exactly_one_t<_Tp, _Types...>::value;
	  return __holds_alternative<__ip>(__v);
	}

	// Shared implementation of the index-based `get` overloads.
	//
	// Calls `__throw_bad_variant_access()` when alternative `_Ip` is not the
	// active one in `__v`; otherwise returns the stored value with the value
	// category and cv-qualification of `__v` (via `auto&&` and `forward`).
	//
	// Deliberately not `static`: this is a header function template called from
	// the external-linkage `get` templates, so it must name the same entity in
	// every translation unit (as the sibling `__generic_get_if` does).
	template <size_t _Ip, class _Vp>
	inline _LIBCPP_INLINE_VISIBILITY
	constexpr auto&& __generic_get(_Vp&& __v) {
	  using __variant_detail::__access::__variant;
	  if (!__holds_alternative<_Ip>(__v)) {
	    __throw_bad_variant_access();
	  }
	  return __variant::__get_alt<_Ip>(_VSTD::forward<_Vp>(__v)).__value;
	}

	// Index-based access, one overload per value category / constness of the
	// variant argument. Each validates `_Ip` at compile time and then defers to
	// `__generic_get<_Ip>`.

	// Non-const lvalue variant.
	template <size_t _Ip, class... _Types>
	inline _LIBCPP_INLINE_VISIBILITY
	constexpr variant_alternative_t<_Ip, variant<_Types...>>& get(
	    variant<_Types...>& __v) {
	  static_assert(_Ip < sizeof...(_Types));
	  using __alternative_type = variant_alternative_t<_Ip, variant<_Types...>>;
	  static_assert(!is_void_v<__alternative_type>);
	  return __generic_get<_Ip>(__v);
	}

	// Non-const rvalue variant.
	template <size_t _Ip, class... _Types>
	inline _LIBCPP_INLINE_VISIBILITY
	constexpr variant_alternative_t<_Ip, variant<_Types...>>&& get(
	    variant<_Types...>&& __v) {
	  static_assert(_Ip < sizeof...(_Types));
	  using __alternative_type = variant_alternative_t<_Ip, variant<_Types...>>;
	  static_assert(!is_void_v<__alternative_type>);
	  return __generic_get<_Ip>(_VSTD::move(__v));
	}

	// Const lvalue variant.
	template <size_t _Ip, class... _Types>
	inline _LIBCPP_INLINE_VISIBILITY
	constexpr const variant_alternative_t<_Ip, variant<_Types...>>& get(
	    const variant<_Types...>& __v) {
	  static_assert(_Ip < sizeof...(_Types));
	  using __alternative_type = variant_alternative_t<_Ip, variant<_Types...>>;
	  static_assert(!is_void_v<__alternative_type>);
	  return __generic_get<_Ip>(__v);
	}

	// Const rvalue variant.
	template <size_t _Ip, class... _Types>
	inline _LIBCPP_INLINE_VISIBILITY
	constexpr const variant_alternative_t<_Ip, variant<_Types...>>&& get(
	    const variant<_Types...>&& __v) {
	  static_assert(_Ip < sizeof...(_Types));
	  using __alternative_type = variant_alternative_t<_Ip, variant<_Types...>>;
	  static_assert(!is_void_v<__alternative_type>);
	  return __generic_get<_Ip>(_VSTD::move(__v));
	}

	// Type-based access, one overload per value category / constness of the
	// variant argument. Each maps `_Tp` to its unique index in `_Types...` and
	// defers to the index-based `get`.

	// Non-const lvalue variant.
	template <class _Tp, class... _Types>
	inline _LIBCPP_INLINE_VISIBILITY
	constexpr _Tp& get(variant<_Types...>& __v) {
	  static_assert(!is_void_v<_Tp>);
	  constexpr size_t __ip = __find_exactly_one_t<_Tp, _Types...>::value;
	  return _VSTD::get<__ip>(__v);
	}

	// Non-const rvalue variant.
	template <class _Tp, class... _Types>
	inline _LIBCPP_INLINE_VISIBILITY
	constexpr _Tp&& get(variant<_Types...>&& __v) {
	  static_assert(!is_void_v<_Tp>);
	  constexpr size_t __ip = __find_exactly_one_t<_Tp, _Types...>::value;
	  return _VSTD::get<__ip>(_VSTD::move(__v));
	}

	// Const lvalue variant.
	template <class _Tp, class... _Types>
	inline _LIBCPP_INLINE_VISIBILITY
	constexpr const _Tp& get(const variant<_Types...>& __v) {
	  static_assert(!is_void_v<_Tp>);
	  constexpr size_t __ip = __find_exactly_one_t<_Tp, _Types...>::value;
	  return _VSTD::get<__ip>(__v);
	}

	// Const rvalue variant.
	template <class _Tp, class... _Types>
	inline _LIBCPP_INLINE_VISIBILITY
	constexpr const _Tp&& get(const variant<_Types...>&& __v) {
	  static_assert(!is_void_v<_Tp>);
	  constexpr size_t __ip = __find_exactly_one_t<_Tp, _Types...>::value;
	  return _VSTD::get<__ip>(_VSTD::move(__v));
	}

	// Shared implementation of the index-based `get_if` overloads: yields a
	// pointer to the stored value when `__v` is non-null and alternative `_Ip`
	// is active, and a null pointer otherwise.
	template <size_t _Ip, class _Vp>
	inline _LIBCPP_INLINE_VISIBILITY
	constexpr auto* __generic_get_if(_Vp* __v) noexcept {
	  using __variant_detail::__access::__variant;
	  // Kept as one conditional expression so the deduced return type is the
	  // pointer type of the non-null branch.
	  return (!__v || !__holds_alternative<_Ip>(*__v))
	             ? nullptr
	             : _VSTD::addressof(__variant::__get_alt<_Ip>(*__v).__value);
	}

	// Index-based non-throwing access. Both overloads validate `_Ip` at compile
	// time and defer to `__generic_get_if<_Ip>`.

	// Pointer to non-const variant.
	template <size_t _Ip, class... _Types>
	inline _LIBCPP_INLINE_VISIBILITY
	constexpr add_pointer_t<variant_alternative_t<_Ip, variant<_Types...>>>
	get_if(variant<_Types...>* __v) noexcept {
	  static_assert(_Ip < sizeof...(_Types));
	  using __alternative_type = variant_alternative_t<_Ip, variant<_Types...>>;
	  static_assert(!is_void_v<__alternative_type>);
	  return __generic_get_if<_Ip>(__v);
	}

	// Pointer to const variant.
	template <size_t _Ip, class... _Types>
	inline _LIBCPP_INLINE_VISIBILITY
	constexpr add_pointer_t<const variant_alternative_t<_Ip, variant<_Types...>>>
	get_if(const variant<_Types...>* __v) noexcept {
	  static_assert(_Ip < sizeof...(_Types));
	  using __alternative_type = variant_alternative_t<_Ip, variant<_Types...>>;
	  static_assert(!is_void_v<__alternative_type>);
	  return __generic_get_if<_Ip>(__v);
	}

	// Type-based non-throwing access. Both overloads map `_Tp` to its unique
	// index in `_Types...` and defer to the index-based `get_if`.

	// Pointer to non-const variant.
	template <class _Tp, class... _Types>
	inline _LIBCPP_INLINE_VISIBILITY
	constexpr add_pointer_t<_Tp>
	get_if(variant<_Types...>* __v) noexcept {
	  static_assert(!is_void_v<_Tp>);
	  constexpr size_t __ip = __find_exactly_one_t<_Tp, _Types...>::value;
	  return _VSTD::get_if<__ip>(__v);
	}

	// Pointer to const variant.
	template <class _Tp, class... _Types>
	inline _LIBCPP_INLINE_VISIBILITY
	constexpr add_pointer_t<const _Tp>
	get_if(const variant<_Types...>* __v) noexcept {
	  static_assert(!is_void_v<_Tp>);
	  constexpr size_t __ip = __find_exactly_one_t<_Tp, _Types...>::value;
	  return _VSTD::get_if<__ip>(__v);
	}

	// Equality: different active indices compare unequal, two valueless
	// variants compare equal, otherwise the held values are compared with
	// `equal_to<>`.
	template <class... _Types>
	inline _LIBCPP_INLINE_VISIBILITY
	constexpr bool operator==(const variant<_Types...>& __lhs,
	                          const variant<_Types...>& __rhs) {
	  using __variant_detail::__visitation::__variant;
	  const size_t __lhs_index = __lhs.index();
	  const size_t __rhs_index = __rhs.index();
	  if (__lhs_index != __rhs_index) {
	    return false;
	  }
	  if (__lhs.valueless_by_exception()) {
	    return true;
	  }
	  return __variant::__visit_value_at(__lhs_index, equal_to<>{}, __lhs, __rhs);
	}

	// Inequality: different active indices compare unequal, two valueless
	// variants do not, otherwise the held values are compared with
	// `not_equal_to<>`.
	template <class... _Types>
	inline _LIBCPP_INLINE_VISIBILITY
	constexpr bool operator!=(const variant<_Types...>& __lhs,
	                          const variant<_Types...>& __rhs) {
	  using __variant_detail::__visitation::__variant;
	  const size_t __lhs_index = __lhs.index();
	  const size_t __rhs_index = __rhs.index();
	  if (__lhs_index != __rhs_index) {
	    return true;
	  }
	  if (__lhs.valueless_by_exception()) {
	    return false;
	  }
	  return __variant::__visit_value_at(
	      __lhs_index, not_equal_to<>{}, __lhs, __rhs);
	}

	// Less-than: nothing is less than a valueless right-hand side; a valueless
	// left-hand side is less than anything else; otherwise order by active
	// index, then by held value using `less<>`.
	template <class... _Types>
	inline _LIBCPP_INLINE_VISIBILITY
	constexpr bool operator<(const variant<_Types...>& __lhs,
	                         const variant<_Types...>& __rhs) {
	  using __variant_detail::__visitation::__variant;
	  if (__rhs.valueless_by_exception()) {
	    return false;
	  }
	  if (__lhs.valueless_by_exception()) {
	    return true;
	  }
	  const size_t __lhs_index = __lhs.index();
	  const size_t __rhs_index = __rhs.index();
	  if (__lhs_index != __rhs_index) {
	    return __lhs_index < __rhs_index;
	  }
	  return __variant::__visit_value_at(__lhs_index, less<>{}, __lhs, __rhs);
	}

	// Greater-than: a valueless left-hand side is greater than nothing;
	// anything else is greater than a valueless right-hand side; otherwise
	// order by active index, then by held value using `greater<>`.
	template <class... _Types>
	inline _LIBCPP_INLINE_VISIBILITY
	constexpr bool operator>(const variant<_Types...>& __lhs,
	                         const variant<_Types...>& __rhs) {
	  using __variant_detail::__visitation::__variant;
	  if (__lhs.valueless_by_exception()) {
	    return false;
	  }
	  if (__rhs.valueless_by_exception()) {
	    return true;
	  }
	  const size_t __lhs_index = __lhs.index();
	  const size_t __rhs_index = __rhs.index();
	  if (__lhs_index != __rhs_index) {
	    return __lhs_index > __rhs_index;
	  }
	  return __variant::__visit_value_at(__lhs_index, greater<>{}, __lhs, __rhs);
	}

	// Less-or-equal: a valueless left-hand side is <= everything; otherwise a
	// valueless right-hand side is >= nothing; otherwise order by active index,
	// then by held value using `less_equal<>`.
	template <class... _Types>
	inline _LIBCPP_INLINE_VISIBILITY
	constexpr bool operator<=(const variant<_Types...>& __lhs,
	                          const variant<_Types...>& __rhs) {
	  using __variant_detail::__visitation::__variant;
	  if (__lhs.valueless_by_exception()) {
	    return true;
	  }
	  if (__rhs.valueless_by_exception()) {
	    return false;
	  }
	  const size_t __lhs_index = __lhs.index();
	  const size_t __rhs_index = __rhs.index();
	  if (__lhs_index != __rhs_index) {
	    return __lhs_index < __rhs_index;
	  }
	  return __variant::__visit_value_at(
	      __lhs_index, less_equal<>{}, __lhs, __rhs);
	}

	// Greater-or-equal: everything is >= a valueless right-hand side; otherwise
	// a valueless left-hand side is >= nothing; otherwise order by active index,
	// then by held value using `greater_equal<>`.
	template <class... _Types>
	inline _LIBCPP_INLINE_VISIBILITY
	constexpr bool operator>=(const variant<_Types...>& __lhs,
	                          const variant<_Types...>& __rhs) {
	  using __variant_detail::__visitation::__variant;
	  if (__rhs.valueless_by_exception()) {
	    return true;
	  }
	  if (__lhs.valueless_by_exception()) {
	    return false;
	  }
	  const size_t __lhs_index = __lhs.index();
	  const size_t __rhs_index = __rhs.index();
	  if (__lhs_index != __rhs_index) {
	    return __lhs_index > __rhs_index;
	  }
	  return __variant::__visit_value_at(
	      __lhs_index, greater_equal<>{}, __lhs, __rhs);
	}

	// Invokes `__visitor` with the values currently held by `__vs...`.
	//
	// Calls `__throw_bad_variant_access()` if any of the variants is valueless.
	// The check is a fold expression rather than a local `bool[]` so that it
	// stays well-formed when no variants are passed (an empty pack would give a
	// zero-length array); an empty fold over `||` is `false`.
	template <class _Visitor, class... _Vs>
	inline _LIBCPP_INLINE_VISIBILITY
	constexpr decltype(auto) visit(_Visitor&& __visitor, _Vs&&... __vs) {
	  using __variant_detail::__visitation::__variant;
	  const bool __any_valueless = (__vs.valueless_by_exception() || ...);
	  if (__any_valueless) {
	    __throw_bad_variant_access();
	  }
	  return __variant::__visit_value(_VSTD::forward<_Visitor>(__visitor),
	                                  _VSTD::forward<_Vs>(__vs)...);
	}

	// Empty unit type. All `monostate` values compare equal to each other.
	struct _LIBCPP_TEMPLATE_VIS monostate {};

	// Equality family.
	inline _LIBCPP_INLINE_VISIBILITY
	constexpr bool operator==(monostate, monostate) noexcept {
	  return true;
	}

	inline _LIBCPP_INLINE_VISIBILITY
	constexpr bool operator!=(monostate, monostate) noexcept {
	  return false;
	}

	// Strict orderings never hold between equal values.
	inline _LIBCPP_INLINE_VISIBILITY
	constexpr bool operator<(monostate, monostate) noexcept {
	  return false;
	}

	inline _LIBCPP_INLINE_VISIBILITY
	constexpr bool operator>(monostate, monostate) noexcept {
	  return false;
	}

	// Non-strict orderings always hold between equal values.
	inline _LIBCPP_INLINE_VISIBILITY
	constexpr bool operator<=(monostate, monostate) noexcept {
	  return true;
	}

	inline _LIBCPP_INLINE_VISIBILITY
	constexpr bool operator>=(monostate, monostate) noexcept {
	  return true;
	}

	// Non-member swap: forwards to the member `swap`. The trailing return type
	// removes this overload from consideration when `__lhs.swap(__rhs)` is not
	// a valid expression, and the noexcept specification mirrors the member's.
	template <class... _Types>
	inline _LIBCPP_INLINE_VISIBILITY
	auto swap(variant<_Types...>& __lhs,
	          variant<_Types...>& __rhs) noexcept(noexcept(__lhs.swap(__rhs)))
	    -> decltype(__lhs.swap(__rhs)) {
	  return __lhs.swap(__rhs);
	}

	// Hash support for `variant`.
	//
	// The result combines a hash of the held value (or a fixed constant when the
	// variant is valueless) with a hash of the active index.
	template <class... _Types>
	struct _LIBCPP_TEMPLATE_VIS hash<variant<_Types...>> {
	  using argument_type = variant<_Types...>;
	  using result_type = size_t;

	  inline _LIBCPP_INLINE_VISIBILITY
	  result_type operator()(const argument_type& __v) const {
	    using __variant_detail::__visitation::__variant;
	    size_t __res =
	        __v.valueless_by_exception()
	            ? 299792458 // Random value chosen by the universe upon creation
	            : __variant::__visit_alt(
	                  [](const auto& __alt) {
	                    using __alt_type = decay_t<decltype(__alt)>;
	                    // Strip const from the alternative's value type so a
	                    // const-qualified alternative is hashed with `hash<T>`
	                    // rather than `hash<const T>`.
	                    using __value_type =
	                        remove_const_t<typename __alt_type::__value_type>;
	                    return hash<__value_type>{}(__alt.__value);
	                  },
	                  __v);
	    return __hash_combine(__res, hash<size_t>{}(__v.index()));
	  }
	};

	// Hash support for `monostate`: every value hashes to the same constant.
	template <>
	struct _LIBCPP_TEMPLATE_VIS hash<monostate> {
	  using argument_type = monostate;
	  using result_type = size_t;

	  inline _LIBCPP_INLINE_VISIBILITY
	  result_type operator()(const argument_type&) const {
	    // return a fundamentally attractive random value.
	    constexpr result_type __monostate_hash = 66740831;
	    return __monostate_hash;
	  }
	};

	#endif // _LIBCPP_STD_VER > 14

	_LIBCPP_END_NAMESPACE_STD

	#endif // _LIBCPP_VARIANT
	Index: projects/clang400-import/contrib/libc++/src/optional.cpp
	===================================================================
	--- projects/clang400-import/contrib/libc++/src/optional.cpp (revision 313642)
	+++ projects/clang400-import/contrib/libc++/src/optional.cpp (revision 313643)
	@@ -1,24 +1,28 @@
	//===------------------------ optional.cpp --------------------------------===//
	//
	// The LLVM Compiler Infrastructure
	//
	// This file is dual licensed under the MIT and the University of Illinois Open
	// Source Licenses. See LICENSE.TXT for details.
	//
	//===----------------------------------------------------------------------===//

	#include "optional"
	#include "experimental/optional"

	// Out-of-line definitions for std::bad_optional_access.
	namespace std
	{

	// Destructor is defaulted here, in the library source file, rather than in
	// the header.
	bad_optional_access::~bad_optional_access() _NOEXCEPT = default;

	// Added by this revision: what() returns the fixed string
	// "bad_optional_access".
	+const char* bad_optional_access::what() const _NOEXCEPT {
	+ return "bad_optional_access";
	+ }
	+
	} // std

	// Same out-of-line defaulted destructor for the bad_optional_access declared
	// in the experimental namespace (opened and closed by the macros below).
	_LIBCPP_BEGIN_NAMESPACE_EXPERIMENTAL

	bad_optional_access::~bad_optional_access() _NOEXCEPT = default;

	_LIBCPP_END_NAMESPACE_EXPERIMENTAL
	Index: projects/clang400-import/contrib/libc++
	===================================================================
	--- projects/clang400-import/contrib/libc++ (revision 313642)
	+++ projects/clang400-import/contrib/libc++ (revision 313643)

	Property changes on: projects/clang400-import/contrib/libc++
	___________________________________________________________________
	Modified: svn:mergeinfo
	## -0,0 +0,1 ##
	Merged /vendor/libc++/dist:r313300-313642
	Index: projects/clang400-import/contrib/llvm/include/llvm/ADT/ilist_iterator.h
	===================================================================
	--- projects/clang400-import/contrib/llvm/include/llvm/ADT/ilist_iterator.h (revision 313642)
	+++ projects/clang400-import/contrib/llvm/include/llvm/ADT/ilist_iterator.h (revision 313643)
	@@ -1,185 +1,198 @@
	//===- llvm/ADT/ilist_iterator.h - Intrusive List Iterator -------*- C++ -*-==//
	//
	// The LLVM Compiler Infrastructure
	//
	// This file is distributed under the University of Illinois Open Source
	// License. See LICENSE.TXT for details.
	//
	//===----------------------------------------------------------------------===//

	#ifndef LLVM_ADT_ILIST_ITERATOR_H
	#define LLVM_ADT_ILIST_ITERATOR_H

	#include "llvm/ADT/ilist_node.h"
	#include <cassert>
	#include <cstddef>
	#include <iterator>
	#include <type_traits>

	namespace llvm {

	namespace ilist_detail {

	/// Find const-correct node types.
	template <class OptionsT, bool IsConst> struct IteratorTraits;
	template <class OptionsT> struct IteratorTraits<OptionsT, false> {
	typedef typename OptionsT::value_type value_type;
	typedef typename OptionsT::pointer pointer;
	typedef typename OptionsT::reference reference;
	typedef ilist_node_impl<OptionsT> *node_pointer;
	typedef ilist_node_impl<OptionsT> &node_reference;
	};
	template <class OptionsT> struct IteratorTraits<OptionsT, true> {
	typedef const typename OptionsT::value_type value_type;
	typedef typename OptionsT::const_pointer pointer;
	typedef typename OptionsT::const_reference reference;
	typedef const ilist_node_impl<OptionsT> *node_pointer;
	typedef const ilist_node_impl<OptionsT> &node_reference;
	};

	/// Direction-aware stepping: \c increment / \c decrement move a node handle
	/// in iteration order, which is list order for forward iterators and the
	/// opposite for reverse iterators.
	template <bool IsReverse> struct IteratorHelper;

	/// Forward direction: incrementing follows the next link.
	template <> struct IteratorHelper<false> : ilist_detail::NodeAccess {
	  using Access = ilist_detail::NodeAccess;

	  template <class T> static void increment(T &I) {
	    I = Access::getNext(I);
	  }
	  template <class T> static void decrement(T &I) {
	    I = Access::getPrev(I);
	  }
	};

	/// Reverse direction: incrementing follows the previous link.
	template <> struct IteratorHelper<true> : ilist_detail::NodeAccess {
	  using Access = ilist_detail::NodeAccess;

	  template <class T> static void increment(T &I) {
	    I = Access::getPrev(I);
	  }
	  template <class T> static void decrement(T &I) {
	    I = Access::getNext(I);
	  }
	};

	} // end namespace ilist_detail

	/// Iterator for intrusive lists based on ilist_node.
	///
	/// \c IsReverse selects the direction in which ++/-- walk the list and
	/// \c IsConst selects the const flavour of the exposed value and node types.
	/// The iterator stores a single node pointer; a default-constructed iterator
	/// holds a null pointer.
	template <class OptionsT, bool IsReverse, bool IsConst>
	class ilist_iterator : ilist_detail::SpecificNodeAccess<OptionsT> {
	// The other three direction/constness flavours need access to NodePtr for
	// the conversions below.
	friend ilist_iterator<OptionsT, IsReverse, !IsConst>;
	friend ilist_iterator<OptionsT, !IsReverse, IsConst>;
	friend ilist_iterator<OptionsT, !IsReverse, !IsConst>;

	typedef ilist_detail::IteratorTraits<OptionsT, IsConst> Traits;
	typedef ilist_detail::SpecificNodeAccess<OptionsT> Access;

	public:
	typedef typename Traits::value_type value_type;
	typedef typename Traits::pointer pointer;
	typedef typename Traits::reference reference;
	typedef ptrdiff_t difference_type;
	typedef std::bidirectional_iterator_tag iterator_category;

	typedef typename OptionsT::const_pointer const_pointer;
	typedef typename OptionsT::const_reference const_reference;

	private:
	typedef typename Traits::node_pointer node_pointer;
	typedef typename Traits::node_reference node_reference;

	node_pointer NodePtr;

	public:
	/// Create from an ilist_node.
	explicit ilist_iterator(node_reference N) : NodePtr(&N) {}

	/// Create from a pointer or reference to a value; the node is looked up
	/// through Access::getNodePtr.
	explicit ilist_iterator(pointer NP) : NodePtr(Access::getNodePtr(NP)) {}
	explicit ilist_iterator(reference NR) : NodePtr(Access::getNodePtr(&NR)) {}
	ilist_iterator() : NodePtr(nullptr) {}

	// This is templated so that we can allow constructing a const iterator from
	// a nonconst iterator...
	// (The enable_if condition rejects only the const -> nonconst direction.)
	template <bool RHSIsConst>
	ilist_iterator(
	const ilist_iterator<OptionsT, IsReverse, RHSIsConst> &RHS,
	typename std::enable_if<IsConst || !RHSIsConst, void *>::type = nullptr)
	: NodePtr(RHS.NodePtr) {}

	// This is templated so that we can allow assigning to a const iterator from
	// a nonconst iterator...
	template <bool RHSIsConst>
	typename std::enable_if<IsConst || !RHSIsConst, ilist_iterator &>::type
	operator=(const ilist_iterator<OptionsT, IsReverse, RHSIsConst> &RHS) {
	NodePtr = RHS.NodePtr;
	return *this;
	}

	- /// Convert from an iterator to its reverse.
	+ /// Explicit conversion between forward/reverse iterators.
	///
	- /// TODO: Roll this into the implicit constructor once we're sure that no one
	- /// is relying on the std::reverse_iterator off-by-one semantics.
	+ /// Translate between forward and reverse iterators without changing range
	+ /// boundaries. The resulting iterator will dereference (and have a handle)
	+ /// to the previous node, which is somewhat unexpected; but converting the
	+ /// two endpoints in a range will give the same range in reverse.
	+ ///
	+ /// This matches std::reverse_iterator conversions.
	+ explicit ilist_iterator(
	+ const ilist_iterator<OptionsT, !IsReverse, IsConst> &RHS)
	+ : ilist_iterator(++RHS.getReverse()) {}
	+
	+ /// Get a reverse iterator to the same node.
	+ ///
	+ /// Gives a reverse iterator that will dereference (and have a handle) to the
	+ /// same node. Converting the endpoint iterators in a range will give a
	+ /// different range; for range operations, use the explicit conversions.
	ilist_iterator<OptionsT, !IsReverse, IsConst> getReverse() const {
	if (NodePtr)
	return ilist_iterator<OptionsT, !IsReverse, IsConst>(*NodePtr);
	return ilist_iterator<OptionsT, !IsReverse, IsConst>();
	}

	/// Const-cast.
	ilist_iterator<OptionsT, IsReverse, false> getNonConst() const {
	if (NodePtr)
	return ilist_iterator<OptionsT, IsReverse, false>(
	const_cast<typename ilist_iterator<OptionsT, IsReverse,
	false>::node_reference>(*NodePtr));
	return ilist_iterator<OptionsT, IsReverse, false>();
	}

	// Accessors...
	// Dereferencing asserts that the node is not known to be the sentinel.
	reference operator*() const {
	assert(!NodePtr->isKnownSentinel());
	return *Access::getValuePtr(NodePtr);
	}
	pointer operator->() const { return &operator*(); }

	// Comparison operators
	// Two iterators are equal exactly when they point at the same node.
	friend bool operator==(const ilist_iterator &LHS, const ilist_iterator &RHS) {
	return LHS.NodePtr == RHS.NodePtr;
	}
	friend bool operator!=(const ilist_iterator &LHS, const ilist_iterator &RHS) {
	return LHS.NodePtr != RHS.NodePtr;
	}

	// Increment and decrement operators...
	// For a reverse iterator ++ follows the previous link and -- the next link.
	ilist_iterator &operator--() {
	NodePtr = IsReverse ? NodePtr->getNext() : NodePtr->getPrev();
	return *this;
	}
	ilist_iterator &operator++() {
	NodePtr = IsReverse ? NodePtr->getPrev() : NodePtr->getNext();
	return *this;
	}
	ilist_iterator operator--(int) {
	ilist_iterator tmp = *this;
	--*this;
	return tmp;
	}
	ilist_iterator operator++(int) {
	ilist_iterator tmp = *this;
	++*this;
	return tmp;
	}

	/// Get the underlying ilist_node.
	node_pointer getNodePtr() const { return static_cast<node_pointer>(NodePtr); }

	/// Check for end. Only valid if ilist_sentinel_tracking<true>.
	/// A null iterator reports false.
	bool isEnd() const { return NodePtr ? NodePtr->isSentinel() : false; }
	};

	template <typename From> struct simplify_type;

	/// Allow ilist_iterators to convert into pointers to a node automatically when
	/// used by the dyn_cast, cast, isa mechanisms...
	///
	/// Only forward (non-reverse) iterators are covered. The simplified value is
	/// the address of the element the iterator refers to.
	///
	/// FIXME: remove this, since there is no implicit conversion to NodeTy.
	template <class OptionsT, bool IsConst>
	struct simplify_type<ilist_iterator<OptionsT, false, IsConst>> {
	  using iterator = ilist_iterator<OptionsT, false, IsConst>;
	  using SimpleType = typename iterator::pointer;

	  static SimpleType getSimplifiedValue(const iterator &Node) {
	    return &*Node;
	  }
	};

	/// A const-qualified iterator simplifies exactly like the unqualified one.
	template <class OptionsT, bool IsConst>
	struct simplify_type<const ilist_iterator<OptionsT, false, IsConst>>
	    : simplify_type<ilist_iterator<OptionsT, false, IsConst>> {};

	} // end namespace llvm

	#endif // LLVM_ADT_ILIST_ITERATOR_H
	Index: projects/clang400-import/contrib/llvm/include/llvm/CodeGen/MachineInstrBundleIterator.h
	===================================================================
	--- projects/clang400-import/contrib/llvm/include/llvm/CodeGen/MachineInstrBundleIterator.h (revision 313642)
	+++ projects/clang400-import/contrib/llvm/include/llvm/CodeGen/MachineInstrBundleIterator.h (revision 313643)
	@@ -1,266 +1,283 @@
	//===- llvm/CodeGen/MachineInstrBundleIterator.h ----------------*- C++ -*-===//
	//
	// The LLVM Compiler Infrastructure
	//
	// This file is distributed under the University of Illinois Open Source
	// License. See LICENSE.TXT for details.
	//
	//===----------------------------------------------------------------------===//
	//
	// Defines an iterator class that bundles MachineInstr.
	//
	//===----------------------------------------------------------------------===//

	#ifndef LLVM_CODEGEN_MACHINEINSTRBUNDLEITERATOR_H
	#define LLVM_CODEGEN_MACHINEINSTRBUNDLEITERATOR_H

	#include "llvm/ADT/ilist.h"
	#include <iterator>

	namespace llvm {

	/// Selects the underlying instruction-list iterator types for a bundle
	/// iterator over \c T, by direction and by constness of \c T. All four
	/// specializations use a sentinel-tracking simple_ilist of the non-const
	/// element type.
	template <class T, bool IsReverse> struct MachineInstrBundleIteratorTraits;

	/// Mutable element, forward direction.
	template <class T> struct MachineInstrBundleIteratorTraits<T, false> {
	  using list_type = simple_ilist<T, ilist_sentinel_tracking<true>>;
	  using instr_iterator = typename list_type::iterator;
	  using nonconst_instr_iterator = typename list_type::iterator;
	  using const_instr_iterator = typename list_type::const_iterator;
	};

	/// Mutable element, reverse direction.
	template <class T> struct MachineInstrBundleIteratorTraits<T, true> {
	  using list_type = simple_ilist<T, ilist_sentinel_tracking<true>>;
	  using instr_iterator = typename list_type::reverse_iterator;
	  using nonconst_instr_iterator = typename list_type::reverse_iterator;
	  using const_instr_iterator = typename list_type::const_reverse_iterator;
	};

	/// Const element, forward direction.
	template <class T> struct MachineInstrBundleIteratorTraits<const T, false> {
	  using list_type = simple_ilist<T, ilist_sentinel_tracking<true>>;
	  using instr_iterator = typename list_type::const_iterator;
	  using nonconst_instr_iterator = typename list_type::iterator;
	  using const_instr_iterator = typename list_type::const_iterator;
	};

	/// Const element, reverse direction.
	template <class T> struct MachineInstrBundleIteratorTraits<const T, true> {
	  using list_type = simple_ilist<T, ilist_sentinel_tracking<true>>;
	  using instr_iterator = typename list_type::const_reverse_iterator;
	  using nonconst_instr_iterator = typename list_type::reverse_iterator;
	  using const_instr_iterator = typename list_type::const_reverse_iterator;
	};

	/// Direction-specific bundle navigation; specialized on \c IsReverse.
	template <bool IsReverse> struct MachineInstrBundleIteratorHelper;

	/// Bundle navigation for forward ilist iterators.
	template <> struct MachineInstrBundleIteratorHelper<false> {
	  /// Walk backwards from \p I to the first instruction of its bundle.
	  /// An end iterator is returned unchanged.
	  template <class Iterator> static Iterator getBundleBegin(Iterator I) {
	    if (I.isEnd())
	      return I;
	    for (; I->isBundledWithPred(); --I) {
	    }
	    return I;
	  }

	  /// Walk forwards from \p I to the last instruction of its bundle.
	  /// An end iterator is returned unchanged.
	  template <class Iterator> static Iterator getBundleFinal(Iterator I) {
	    if (I.isEnd())
	      return I;
	    for (; I->isBundledWithSucc(); ++I) {
	    }
	    return I;
	  }

	  /// Advance \p I to the node that follows the last node of its bundle.
	  template <class Iterator> static void increment(Iterator &I) {
	    Iterator LastInBundle = getBundleFinal(I);
	    I = std::next(LastInBundle);
	  }

	  /// Move \p I to the first node of the bundle that ends just before it.
	  template <class Iterator> static void decrement(Iterator &I) {
	    Iterator Preceding = std::prev(I);
	    I = getBundleBegin(Preceding);
	  }
	};

	/// Bundle navigation for reverse ilist iterators. The bundle endpoints are
	/// found by converting to the opposite direction with getReverse(), running
	/// the forward helper, and converting back.
	template <> struct MachineInstrBundleIteratorHelper<true> {
	  /// Get the beginning of the current bundle.
	  template <class Iterator> static Iterator getBundleBegin(Iterator I) {
	    auto Forward = I.getReverse();
	    auto ForwardBegin =
	        MachineInstrBundleIteratorHelper<false>::getBundleBegin(Forward);
	    return ForwardBegin.getReverse();
	  }

	  /// Get the final node of the current bundle.
	  template <class Iterator> static Iterator getBundleFinal(Iterator I) {
	    auto Forward = I.getReverse();
	    auto ForwardFinal =
	        MachineInstrBundleIteratorHelper<false>::getBundleFinal(Forward);
	    return ForwardFinal.getReverse();
	  }

	  /// Increment reverse ilist iterator: step once, then settle on the
	  /// beginning of the bundle that was entered.
	  template <class Iterator> static void increment(Iterator &I) {
	    Iterator Stepped = std::next(I);
	    I = getBundleBegin(Stepped);
	  }

	  /// Decrement reverse ilist iterator: go to the final node of the current
	  /// bundle, then step back once.
	  template <class Iterator> static void decrement(Iterator &I) {
	    Iterator LastInBundle = getBundleFinal(I);
	    I = std::prev(LastInBundle);
	  }
	};

	/// MachineBasicBlock iterator that automatically skips over MIs that are
	/// inside bundles (i.e. walk top level MIs only).
	template <typename Ty, bool IsReverse = false>
	class MachineInstrBundleIterator : MachineInstrBundleIteratorHelper<IsReverse> {
	typedef MachineInstrBundleIteratorTraits<Ty, IsReverse> Traits;
	typedef typename Traits::instr_iterator instr_iterator;
	instr_iterator MII;

	public:
	typedef typename instr_iterator::value_type value_type;
	typedef typename instr_iterator::difference_type difference_type;
	typedef typename instr_iterator::pointer pointer;
	typedef typename instr_iterator::reference reference;
	typedef std::bidirectional_iterator_tag iterator_category;

	typedef typename instr_iterator::const_pointer const_pointer;
	typedef typename instr_iterator::const_reference const_reference;

	private:
	typedef typename Traits::nonconst_instr_iterator nonconst_instr_iterator;
	typedef typename Traits::const_instr_iterator const_instr_iterator;
	typedef MachineInstrBundleIterator<
	typename nonconst_instr_iterator::value_type, IsReverse>
	nonconst_iterator;
	typedef MachineInstrBundleIterator<Ty, !IsReverse> reverse_iterator;

	public:
	/// Wrap an instruction iterator.
	///
	/// Asserts that \p MI is a null iterator, the end iterator, or an
	/// instruction that is not bundled with its predecessor.
	MachineInstrBundleIterator(instr_iterator MI) : MII(MI) {
	  assert((!MI.getNodePtr() || MI.isEnd() || !MI->isBundledWithPred()) &&
	         "It's not legal to initialize MachineInstrBundleIterator with a "
	         "bundled MI");
	}

	/// Wrap a reference to an instruction.
	///
	/// Asserts that \p MI is not bundled with its predecessor.
	MachineInstrBundleIterator(reference MI) : MII(MI) {
	  assert(!MI.isBundledWithPred() && "It's not legal to initialize "
	                                    "MachineInstrBundleIterator with a "
	                                    "bundled MI");
	}

	/// Wrap a pointer to an instruction.
	///
	/// Asserts that \p MI is null or not bundled with its predecessor.
	MachineInstrBundleIterator(pointer MI) : MII(MI) {
	  // FIXME: This conversion should be explicit.
	  assert((!MI || !MI->isBundledWithPred()) && "It's not legal to initialize "
	                                              "MachineInstrBundleIterator "
	                                              "with a bundled MI");
	}
	// Converting constructor between element types. It participates in overload
	// resolution only when OtherTy * converts to Ty *, which permits going from
	// a non-const iterator to a const one but not the other way round.
	template <class OtherTy>
	MachineInstrBundleIterator(
	    const MachineInstrBundleIterator<OtherTy, IsReverse> &I,
	    typename std::enable_if<std::is_convertible<OtherTy *, Ty *>::value,
	                            void *>::type = nullptr)
	    : MII(I.getInstrIterator()) {}
	MachineInstrBundleIterator() : MII(nullptr) {}

	+ /// Explicit conversion between forward/reverse iterators.
	+ ///
	+ /// Translate between forward and reverse iterators without changing range
	+ /// boundaries. The resulting iterator will dereference (and have a handle)
	+ /// to the previous node, which is somewhat unexpected; but converting the
	+ /// two endpoints in a range will give the same range in reverse.
	+ ///
	+ /// This matches std::reverse_iterator conversions.
	+ explicit MachineInstrBundleIterator(
	+ const MachineInstrBundleIterator<Ty, !IsReverse> &I)
	+ : MachineInstrBundleIterator(++I.getReverse()) {}
	+
	/// Get the bundle iterator for the given instruction's bundle.
	static MachineInstrBundleIterator getAtBundleBegin(instr_iterator MI) {
	return MachineInstrBundleIteratorHelper<IsReverse>::getBundleBegin(MI);
	}

	/// Dereference to the instruction the wrapped iterator points at.
	reference operator*() const { return *MII; }
	/// Member access; forwards to operator*.
	pointer operator->() const { return &operator*(); }

	/// Check for null.
	bool isValid() const { return MII.getNodePtr(); }

	friend bool operator==(const MachineInstrBundleIterator &L,
	const MachineInstrBundleIterator &R) {
	return L.MII == R.MII;
	}
	friend bool operator==(const MachineInstrBundleIterator &L,
	const const_instr_iterator &R) {
	return L.MII == R; // Avoid assertion about validity of R.
	}
	friend bool operator==(const const_instr_iterator &L,
	const MachineInstrBundleIterator &R) {
	return L == R.MII; // Avoid assertion about validity of L.
	}
	friend bool operator==(const MachineInstrBundleIterator &L,
	const nonconst_instr_iterator &R) {
	return L.MII == R; // Avoid assertion about validity of R.
	}
	friend bool operator==(const nonconst_instr_iterator &L,
	const MachineInstrBundleIterator &R) {
	return L == R.MII; // Avoid assertion about validity of L.
	}
	friend bool operator==(const MachineInstrBundleIterator &L, const_pointer R) {
	return L == const_instr_iterator(R); // Avoid assertion about validity of R.
	}
	friend bool operator==(const_pointer L, const MachineInstrBundleIterator &R) {
	return const_instr_iterator(L) == R; // Avoid assertion about validity of L.
	}
	friend bool operator==(const MachineInstrBundleIterator &L,
	const_reference R) {
	return L == &R; // Avoid assertion about validity of R.
	}
	friend bool operator==(const_reference L,
	const MachineInstrBundleIterator &R) {
	return &L == R; // Avoid assertion about validity of L.
	}

	friend bool operator!=(const MachineInstrBundleIterator &L,
	const MachineInstrBundleIterator &R) {
	return !(L == R);
	}
	friend bool operator!=(const MachineInstrBundleIterator &L,
	const const_instr_iterator &R) {
	return !(L == R);
	}
	friend bool operator!=(const const_instr_iterator &L,
	const MachineInstrBundleIterator &R) {
	return !(L == R);
	}
	friend bool operator!=(const MachineInstrBundleIterator &L,
	const nonconst_instr_iterator &R) {
	return !(L == R);
	}
	friend bool operator!=(const nonconst_instr_iterator &L,
	const MachineInstrBundleIterator &R) {
	return !(L == R);
	}
	friend bool operator!=(const MachineInstrBundleIterator &L, const_pointer R) {
	return !(L == R);
	}
	friend bool operator!=(const_pointer L, const MachineInstrBundleIterator &R) {
	return !(L == R);
	}
	friend bool operator!=(const MachineInstrBundleIterator &L,
	const_reference R) {
	return !(L == R);
	}
	friend bool operator!=(const_reference L,
	const MachineInstrBundleIterator &R) {
	return !(L == R);
	}

	// Increment and decrement operators. The actual stepping is done by the
	// direction-specific helper this class derives from.

	/// Pre-decrement.
	MachineInstrBundleIterator &operator--() {
	  MachineInstrBundleIteratorHelper<IsReverse>::decrement(MII);
	  return *this;
	}

	/// Pre-increment.
	MachineInstrBundleIterator &operator++() {
	  MachineInstrBundleIteratorHelper<IsReverse>::increment(MII);
	  return *this;
	}

	/// Post-decrement: returns the position held before stepping.
	MachineInstrBundleIterator operator--(int) {
	  MachineInstrBundleIterator Before(*this);
	  operator--();
	  return Before;
	}

	/// Post-increment: returns the position held before stepping.
	MachineInstrBundleIterator operator++(int) {
	  MachineInstrBundleIterator Before(*this);
	  operator++();
	  return Before;
	}

	instr_iterator getInstrIterator() const { return MII; }

	nonconst_iterator getNonConstIterator() const { return MII.getNonConst(); }

	+ /// Get a reverse iterator to the same node.
	+ ///
	+ /// Gives a reverse iterator that will dereference (and have a handle) to the
	+ /// same node. Converting the endpoint iterators in a range will give a
	+ /// different range; for range operations, use the explicit conversions.
	reverse_iterator getReverse() const { return MII.getReverse(); }
	};

	} // end namespace llvm

	#endif
	Index: projects/clang400-import/contrib/llvm/include/llvm/IR/PassManager.h
	===================================================================
	--- projects/clang400-import/contrib/llvm/include/llvm/IR/PassManager.h (revision 313642)
	+++ projects/clang400-import/contrib/llvm/include/llvm/IR/PassManager.h (revision 313643)
	@@ -1,1255 +1,1261 @@
	//===- PassManager.h - Pass management infrastructure -----------*- C++ -*-===//
	//
	// The LLVM Compiler Infrastructure
	//
	// This file is distributed under the University of Illinois Open Source
	// License. See LICENSE.TXT for details.
	//
	//===----------------------------------------------------------------------===//
	/// \file
	///
	/// This header defines various interfaces for pass management in LLVM. There
	/// is no "pass" interface in LLVM per se. Instead, an instance of any class
	/// which supports a method to 'run' it over a unit of IR can be used as
	/// a pass. A pass manager is generally a tool to collect a sequence of passes
	/// which run over a particular IR construct, and run each of them in sequence
	/// over each such construct in the containing IR construct. As there is no
	/// containing IR construct for a Module, a manager for passes over modules
	/// forms the base case which runs its managed passes in sequence over the
	/// single module provided.
	///
	/// The core IR library provides managers for running passes over
	/// modules and functions.
	///
	/// * FunctionPassManager can run over a Module, runs each pass over
	/// a Function.
	/// * ModulePassManager must be directly run, runs each pass over the Module.
	///
	/// Note that the implementations of the pass managers use concept-based
	/// polymorphism as outlined in the "Value Semantics and Concept-based
	/// Polymorphism" talk (or its abbreviated sibling "Inheritance Is The Base
	/// Class of Evil") by Sean Parent:
	/// * http://github.com/sean-parent/sean-parent.github.com/wiki/Papers-and-Presentations
	/// * http://www.youtube.com/watch?v=_BpMYeUFXv8
	/// * http://channel9.msdn.com/Events/GoingNative/2013/Inheritance-Is-The-Base-Class-of-Evil
	///
	//===----------------------------------------------------------------------===//

	#ifndef LLVM_IR_PASSMANAGER_H
	#define LLVM_IR_PASSMANAGER_H

	#include "llvm/ADT/DenseMap.h"
	#include "llvm/ADT/STLExtras.h"
	#include "llvm/ADT/SmallPtrSet.h"
	#include "llvm/ADT/TinyPtrVector.h"
	#include "llvm/IR/Function.h"
	#include "llvm/IR/Module.h"
	#include "llvm/IR/PassManagerInternal.h"
	#include "llvm/Support/Debug.h"
	#include "llvm/Support/TypeName.h"
	#include "llvm/Support/raw_ostream.h"
	#include "llvm/Support/type_traits.h"
	#include <list>
	#include <memory>
	#include <vector>

	namespace llvm {

	/// A special type used by analysis passes to provide an address that
	/// identifies that particular analysis pass type.
	///
	/// Analysis passes should have a static data member of this type and derive
	/// from the \c AnalysisInfoMixin to get a static ID method used to identify
	/// the analysis in the pass management infrastructure.
	///
	/// The type is empty; only the address of an instance is meaningful. It is
	/// declared with 8-byte alignment so that pointers to it can be stored in
	/// data structures that use the low bits of pointers.
	struct alignas(8) AnalysisKey {};

	/// A special type used to provide an address that identifies a set of related
	/// analyses. These sets are primarily used below to mark sets of analyses as
	/// preserved.
	///
	/// For example, a transformation can indicate that it preserves the CFG of a
	/// function by preserving the appropriate AnalysisSetKey. An analysis that
	/// depends only on the CFG can then check if that AnalysisSetKey is preserved;
	/// if it is, the analysis knows that it itself is preserved.
	///
	/// Like \c AnalysisKey, the type is empty and 8-byte aligned; only the address
	/// of an instance is meaningful.
	struct alignas(8) AnalysisSetKey {};

	/// A set of analyses that are preserved following a run of a transformation
	/// pass.
	///
	/// Transformation passes build and return these objects to communicate which
	/// analyses are still valid after the transformation. For most passes this is
	/// fairly simple: if they don't change anything all analyses are preserved,
	/// otherwise only a short list of analyses that have been explicitly updated
	/// are preserved.
	///
	/// This class also lets transformation passes mark abstract sets of analyses
	/// as preserved. A transformation that (say) does not alter the CFG can
	/// indicate such by marking a particular AnalysisSetKey as preserved, and
	/// then analyses can query whether that AnalysisSetKey is preserved.
	///
	/// Finally, this class can represent an "abandoned" analysis, which is
	/// not preserved even if it would be covered by some abstract set of analyses.
	///
	/// Given a `PreservedAnalyses` object, an analysis will typically want to
	/// figure out whether it is preserved. In the example below, MyAnalysisType is
	/// preserved if it's not abandoned, and (a) it's explicitly marked as
	/// preserved, (b), the set AllAnalysesOn<MyIRUnit> is preserved, or (c) both
	/// AnalysisSetA and AnalysisSetB are preserved.
	///
	/// ```
	/// auto PAC = PA.getChecker<MyAnalysisType>();
	/// if (PAC.preserved() || PAC.preservedSet<AllAnalysesOn<MyIRUnit>>() ||
	/// (PAC.preservedSet<AnalysisSetA>() &&
	/// PAC.preservedSet<AnalysisSetB>())) {
	/// // The analysis has been successfully preserved ...
	/// }
	/// ```
	class PreservedAnalyses {
	public:
	/// \brief Convenience factory function for the empty preserved set.
	static PreservedAnalyses none() { return PreservedAnalyses(); }

	/// \brief Construct a special preserved set that preserves all passes.
	static PreservedAnalyses all() {
	PreservedAnalyses PA;
	PA.PreservedIDs.insert(&AllAnalysesKey);
	return PA;
	}

	/// Mark an analysis as preserved.
	template <typename AnalysisT> void preserve() { preserve(AnalysisT::ID()); }

	/// \brief Given an analysis's ID, mark the analysis as preserved, adding it
	/// to the set.
	void preserve(AnalysisKey *ID) {
	// Clear this ID from the explicit not-preserved set if present.
	NotPreservedAnalysisIDs.erase(ID);

	// If we're not already preserving all analyses (other than those in
	// NotPreservedAnalysisIDs).
	if (!areAllPreserved())
	PreservedIDs.insert(ID);
	}

	/// Mark an analysis set as preserved.
	template <typename AnalysisSetT> void preserveSet() {
	preserveSet(AnalysisSetT::ID());
	}

	/// Mark an analysis set as preserved using its ID.
	void preserveSet(AnalysisSetKey *ID) {
	// If we're not already in the saturated 'all' state, add this set.
	if (!areAllPreserved())
	PreservedIDs.insert(ID);
	}

	/// Mark an analysis as abandoned.
	///
	/// An abandoned analysis is not preserved, even if it is nominally covered
	/// by some other set or was previously explicitly marked as preserved.
	///
	/// Note that you can only abandon a specific analysis, not a set of
	/// analyses.
	template <typename AnalysisT> void abandon() { abandon(AnalysisT::ID()); }

	/// Mark an analysis as abandoned using its ID.
	///
	/// An abandoned analysis is not preserved, even if it is nominally covered
	/// by some other set or was previously explicitly marked as preserved.
	///
	/// Note that you can only abandon a specific analysis, not a set of
	/// analyses.
	void abandon(AnalysisKey *ID) {
	PreservedIDs.erase(ID);
	NotPreservedAnalysisIDs.insert(ID);
	}

	/// \brief Intersect this set with another in place.
	///
	/// This is a mutating operation on this preserved set, removing all
	/// preserved passes which are not also preserved in the argument.
	///
	/// \param Arg the preserved set to intersect with; it is only read.
	void intersect(const PreservedAnalyses &Arg) {
	// Intersecting with a set that preserves everything changes nothing.
	if (Arg.areAllPreserved())
	return;
	// If this set preserves everything, the intersection is exactly Arg.
	if (areAllPreserved()) {
	*this = Arg;
	return;
	}
	// The intersection requires the union of the explicitly not-preserved
	// IDs and the intersection of the preserved IDs.
	for (auto ID : Arg.NotPreservedAnalysisIDs) {
	PreservedIDs.erase(ID);
	NotPreservedAnalysisIDs.insert(ID);
	}
	// NOTE(review): this erases from PreservedIDs while range-iterating over
	// it; presumably SmallPtrSet::erase keeps live iterators usable -- confirm.
	for (auto ID : PreservedIDs)
	if (!Arg.PreservedIDs.count(ID))
	PreservedIDs.erase(ID);
	}

	/// \brief Intersect this set with a temporary other set in place.
	///
	/// Mutates this preserved set so that it keeps only what \p Arg also
	/// preserves. Because \p Arg is a temporary, its storage is taken over when
	/// the result is simply \p Arg itself.
	void intersect(PreservedAnalyses &&Arg) {
	  // An argument that preserves everything cannot remove anything.
	  if (Arg.areAllPreserved())
	    return;

	  // A saturated set intersected with Arg is Arg; steal it.
	  if (areAllPreserved()) {
	    *this = std::move(Arg);
	    return;
	  }

	  // General case: the union of the explicitly not-preserved IDs and the
	  // intersection of the preserved IDs, exactly as the const-reference
	  // overload computes it. Binding to a const lvalue selects that overload.
	  const PreservedAnalyses &Other = Arg;
	  intersect(Other);
	}

	/// A checker object that makes it easy to query for whether an analysis or
	/// some set covering it is preserved.
	///
	/// The checker holds a reference to the \c PreservedAnalyses it was built
	/// from, and records at construction whether its analysis was abandoned.
	class PreservedAnalysisChecker {
	  friend class PreservedAnalyses;

	  const PreservedAnalyses &PA;
	  AnalysisKey *const ID;
	  const bool IsAbandoned;

	  /// A PreservedAnalysisChecker is tied to a particular Analysis because
	  /// `preserved()` and `preservedSet()` both return false if the Analysis
	  /// was abandoned.
	  PreservedAnalysisChecker(const PreservedAnalyses &PA, AnalysisKey *ID)
	      : PA(PA), ID(ID), IsAbandoned(PA.NotPreservedAnalysisIDs.count(ID)) {}

	public:
	  /// Returns true if the checker's analysis was not abandoned and either
	  /// - the analysis is explicitly preserved or
	  /// - all analyses are preserved.
	  bool preserved() const {
	    return !IsAbandoned && (PA.PreservedIDs.count(&AllAnalysesKey) ||
	                            PA.PreservedIDs.count(ID));
	  }

	  /// Returns true if the checker's analysis was not abandoned and either
	  /// - \p AnalysisSetT is explicitly preserved or
	  /// - all analyses are preserved.
	  template <typename AnalysisSetT> bool preservedSet() const {
	    AnalysisSetKey *SetID = AnalysisSetT::ID();
	    return !IsAbandoned && (PA.PreservedIDs.count(&AllAnalysesKey) ||
	                            PA.PreservedIDs.count(SetID));
	  }
	};

	/// Build a checker for this `PreservedAnalyses` and the specified analysis
	/// type.
	///
	/// You can use the returned object to query whether an analysis was
	/// preserved. See the example in the comment on `PreservedAnalyses`.
	///
	/// The checker keeps a reference to this object.
	template <typename AnalysisT> PreservedAnalysisChecker getChecker() const {
	return PreservedAnalysisChecker(*this, AnalysisT::ID());
	}

	/// Build a checker for this `PreservedAnalyses` and the specified analysis
	/// ID.
	///
	/// You can use the returned object to query whether an analysis was
	/// preserved. See the example in the comment on `PreservedAnalyses`.
	///
	/// The checker keeps a reference to this object.
	PreservedAnalysisChecker getChecker(AnalysisKey *ID) const {
	return PreservedAnalysisChecker(*this, ID);
	}

	/// Test whether all analyses are preserved (and none are abandoned).
	///
	/// This is used primarily to optimize for the common case of a transformation
	/// which makes no changes to the IR.
	bool areAllPreserved() const {
	return NotPreservedAnalysisIDs.empty() &&
	PreservedIDs.count(&AllAnalysesKey);
	}

	/// Directly test whether a set of analyses is preserved.
	///
	/// This is only true when no analyses have been explicitly abandoned.
	template <typename AnalysisSetT> bool allAnalysesInSetPreserved() const {
	return allAnalysesInSetPreserved(AnalysisSetT::ID());
	}

	/// Directly test whether a set of analyses is preserved.
	///
	/// This is only true when no analyses have been explicitly abandoned.
	///
	/// \param SetID key of the analysis set to look up.
	/// \returns true if nothing is abandoned and either the all-analyses key or
	/// \p SetID is recorded as preserved.
	bool allAnalysesInSetPreserved(AnalysisSetKey *SetID) const {
	  return NotPreservedAnalysisIDs.empty() &&
	         (PreservedIDs.count(&AllAnalysesKey) || PreservedIDs.count(SetID));
	}

	private:
	/// A special key used to indicate all analyses.
	static AnalysisSetKey AllAnalysesKey;

	/// The IDs of analyses and analysis sets that are preserved.
	SmallPtrSet<void *, 2> PreservedIDs;

	/// The IDs of explicitly not-preserved analyses.
	///
	/// If an analysis in this set is covered by a set in `PreservedIDs`, we
	/// consider it not-preserved. That is, `NotPreservedAnalysisIDs` always
	/// "wins" over analysis sets in `PreservedIDs`.
	///
	/// Also, a given ID should never occur both here and in `PreservedIDs`.
	SmallPtrSet<AnalysisKey *, 2> NotPreservedAnalysisIDs;
	};

	// Forward declare the analysis manager template.
	template <typename IRUnitT, typename... ExtraArgTs> class AnalysisManager;

	/// A CRTP mix-in to automatically provide informational APIs needed for
	/// passes.
	///
	/// This provides some boilerplate for types that are passes.
	template <typename DerivedT> struct PassInfoMixin {
	/// Gets the name of the pass we are mixed into.
	///
	/// The name is the type name of \c DerivedT as reported by getTypeName,
	/// with a leading "llvm::" removed when present.
	static StringRef name() {
	+ static_assert(std::is_base_of<PassInfoMixin, DerivedT>::value,
	+ "Must pass the derived type as the template argument!");
	StringRef Name = getTypeName<DerivedT>();
	// Drop the namespace qualifier so only the class name remains.
	if (Name.startswith("llvm::"))
	Name = Name.drop_front(strlen("llvm::"));
	return Name;
	}
	};

	/// A CRTP mix-in that provides informational APIs needed for analysis passes.
	///
	/// This provides some boilerplate for types that are analysis passes. It
	/// automatically mixes in \c PassInfoMixin.
	template <typename DerivedT>
	struct AnalysisInfoMixin : PassInfoMixin<DerivedT> {
	/// Returns an opaque, unique ID for this analysis type.
	///
	/// This ID is a pointer type that is guaranteed to be 8-byte aligned and thus
	/// suitable for use in sets, maps, and other data structures that use the low
	/// bits of pointers.
	///
	/// Note that this requires the derived type provide a static \c AnalysisKey
	/// member called \c Key.
	///
	/// FIXME: The only reason the mixin type itself can't declare the Key value
	/// is that some compilers cannot correctly unique a templated static variable
	/// so it has the same addresses in each instantiation. The only currently
	/// known platform with this limitation is Windows DLL builds, specifically
	/// building each part of LLVM as a DLL. If we ever remove that build
	/// configuration, this mixin can provide the static key as well.
	- static AnalysisKey *ID() { return &DerivedT::Key; }
	+ static AnalysisKey *ID() {
	+ static_assert(std::is_base_of<AnalysisInfoMixin, DerivedT>::value,
	+ "Must pass the derived type as the template argument!");
	+ return &DerivedT::Key;
	+ }
	};

	/// This templated class represents "all analyses that operate over \<a
	/// particular IR unit\>" (e.g. a Function or a Module) in instances of
	/// PreservedAnalysis.
	///
	/// This lets a transformation say e.g. "I preserved all function analyses".
	///
	/// Note that you must provide an explicit instantiation declaration and
	/// definition for this template in order to get the correct behavior on
	/// Windows. Otherwise, the address of SetKey will not be stable.
	template <typename IRUnitT>
	class AllAnalysesOn {
	  /// Per-IRUnitT key object; only its address is used.
	  static AnalysisSetKey SetKey;

	public:
	  /// Returns the address of the key for this IR unit type.
	  static AnalysisSetKey *ID() { return &AllAnalysesOn::SetKey; }
	};

	/// Out-of-class definition of the per-IRUnitT key.
	template <typename IRUnitT> AnalysisSetKey AllAnalysesOn<IRUnitT>::SetKey;

	extern template class AllAnalysesOn<Module>;
	extern template class AllAnalysesOn<Function>;

	/// \brief Manages a sequence of passes over a particular unit of IR.
	///
	/// A pass manager contains a sequence of passes to run over a particular unit
	/// of IR (e.g. Functions, Modules). It is itself a valid pass over that unit of
	/// IR, and when run over some given IR will run each of its contained passes in
	/// sequence. Pass managers are the primary and most basic building block of a
	/// pass pipeline.
	///
	/// When you run a pass manager, you provide an \c AnalysisManager<IRUnitT>
	/// argument. The pass manager will propagate that analysis manager to each
	/// pass it runs, and will call the analysis manager's invalidation routine with
	/// the PreservedAnalyses of each pass it runs.
	template <typename IRUnitT,
	typename AnalysisManagerT = AnalysisManager<IRUnitT>,
	typename... ExtraArgTs>
	class PassManager : public PassInfoMixin<
	PassManager<IRUnitT, AnalysisManagerT, ExtraArgTs...>> {
	public:
	/// \brief Construct a pass manager.
	///
	/// If \p DebugLogging is true, we'll log our progress to llvm::dbgs().
	explicit PassManager(bool DebugLogging = false) : DebugLogging(DebugLogging) {}

	// FIXME: These are equivalent to the default move constructor/move
	// assignment. However, using = default triggers linker errors due to the
	// explicit instantiations below. Find a way to use the default and remove the
	// duplicated code here.

	/// Move-construct: takes over the pass list and copies the logging flag.
	PassManager(PassManager &&Arg)
	    : Passes(std::move(Arg.Passes)), DebugLogging(Arg.DebugLogging) {}

	/// Move-assign: takes over the pass list and copies the logging flag.
	PassManager &operator=(PassManager &&RHS) {
	  DebugLogging = RHS.DebugLogging;
	  Passes = std::move(RHS.Passes);
	  return *this;
	}

	/// \brief Run all of the passes in this manager over the given unit of IR.
	/// ExtraArgs are passed to each pass.
	///
	/// \param IR the unit of IR every pass is run on.
	/// \param AM the analysis manager handed to each pass and invalidated after
	/// each pass with that pass's preserved set.
	/// \returns the intersection of the preserved sets of all passes, with
	/// \c AllAnalysesOn<IRUnitT> additionally marked as preserved.
	PreservedAnalyses run(IRUnitT &IR, AnalysisManagerT &AM,
	ExtraArgTs... ExtraArgs) {
	// Start from "everything preserved" and narrow it down pass by pass.
	PreservedAnalyses PA = PreservedAnalyses::all();

	if (DebugLogging)
	dbgs() << "Starting " << getTypeName<IRUnitT>() << " pass manager run.\n";

	for (unsigned Idx = 0, Size = Passes.size(); Idx != Size; ++Idx) {
	if (DebugLogging)
	dbgs() << "Running pass: " << Passes[Idx]->name() << " on "
	<< IR.getName() << "\n";

	PreservedAnalyses PassPA = Passes[Idx]->run(IR, AM, ExtraArgs...);

	// Update the analysis manager as each pass runs and potentially
	// invalidates analyses.
	AM.invalidate(IR, PassPA);

	// Finally, intersect the preserved analyses to compute the aggregate
	// preserved set for this pass manager.
	PA.intersect(std::move(PassPA));

	// FIXME: Historically, the pass managers all called the LLVM context's
	// yield function here. We don't have a generic way to acquire the
	// context and it isn't yet clear what the right pattern is for yielding
	// in the new pass manager so it is currently omitted.
	//IR.getContext().yield();
	}

	// Invalidation was handled after each pass in the above loop for the
	// current unit of IR. Therefore, the remaining analysis results in the
	// AnalysisManager are preserved. We mark this with a set so that we don't
	// need to inspect each one individually.
	PA.preserveSet<AllAnalysesOn<IRUnitT>>();

	if (DebugLogging)
	dbgs() << "Finished " << getTypeName<IRUnitT>() << " pass manager run.\n";

	return PA;
	}

	template <typename PassT> void addPass(PassT Pass) {
	typedef detail::PassModel<IRUnitT, PassT, PreservedAnalyses,
	AnalysisManagerT, ExtraArgTs...>
	PassModelT;
	Passes.emplace_back(new PassModelT(std::move(Pass)));
	}

	private:
	typedef detail::PassConcept<IRUnitT, AnalysisManagerT, ExtraArgTs...>
	PassConceptT;

	std::vector<std::unique_ptr<PassConceptT>> Passes;

	/// \brief Flag indicating whether we should do debug logging.
	bool DebugLogging;
	};

	extern template class PassManager<Module>;
	/// \brief Convenience typedef for a pass manager over modules.
	typedef PassManager<Module> ModulePassManager;

	extern template class PassManager<Function>;
	/// \brief Convenience typedef for a pass manager over functions.
	typedef PassManager<Function> FunctionPassManager;

	/// \brief A container for analyses that lazily runs them and caches their
	/// results.
	///
	/// This class can manage analyses for any IR unit where the address of the IR
	/// unit suffices as its identity.
	template <typename IRUnitT, typename... ExtraArgTs> class AnalysisManager {
	public:
	class Invalidator;

	private:
	// Now that we've defined our invalidator, we can define the concept types.
	typedef detail::AnalysisResultConcept<IRUnitT, PreservedAnalyses, Invalidator>
	ResultConceptT;
	typedef detail::AnalysisPassConcept<IRUnitT, PreservedAnalyses, Invalidator,
	ExtraArgTs...>
	PassConceptT;

	/// \brief List of analysis pass IDs and associated concept pointers.
	///
	/// Requires iterators to be valid across appending new entries and arbitrary
	/// erases. Provides the analysis ID to enable finding iterators to a given
	/// entry in maps below, and provides the storage for the actual result
	/// concept.
	typedef std::list<std::pair<AnalysisKey *, std::unique_ptr<ResultConceptT>>>
	AnalysisResultListT;

	/// \brief Map type from IRUnitT pointer to our custom list type.
	typedef DenseMap<IRUnitT *, AnalysisResultListT> AnalysisResultListMapT;

	/// \brief Map type from a pair of analysis ID and IRUnitT pointer to an
	/// iterator into a particular result list (which is where the actual analysis
	/// result is stored).
	///
	/// Both halves of the key are pointers: lookups are made with an
	/// \c AnalysisKey pointer and the address of the IR unit.
	typedef DenseMap<std::pair<AnalysisKey *, IRUnitT *>,
	                 typename AnalysisResultListT::iterator>
	    AnalysisResultMapT;

	public:
	/// API to communicate dependencies between analyses during invalidation.
	///
	/// When an analysis result embeds handles to other analysis results, it
	/// needs to be invalidated both when its own information isn't preserved and
	/// when any of its embedded analysis results end up invalidated. We pass an
	/// \c Invalidator object as an argument to \c invalidate() in order to let
	/// the analysis results themselves define the dependency graph on the fly.
	/// This lets us avoid building an explicit representation of the
	/// dependencies between analysis results.
	class Invalidator {
	public:
	/// Trigger the invalidation of some other analysis pass if not already
	/// handled and return whether it was in fact invalidated.
	///
	/// This is expected to be called from within a given analysis result's \c
	/// invalidate method to trigger a depth-first walk of all inter-analysis
	/// dependencies. The same \p IR unit and \p PA passed to that result's \c
	/// invalidate method should in turn be provided to this routine.
	///
	/// The first time this is called for a given analysis pass, it will call
	/// the corresponding result's \c invalidate method. Subsequent calls will
	/// use a cache of the results of that initial call. It is an error to form
	/// cyclic dependencies between analysis results.
	///
	/// This returns true if the given analysis's result is invalid. Any
	/// dependencies on it will become invalid as a result.
	template <typename PassT>
	bool invalidate(IRUnitT &IR, const PreservedAnalyses &PA) {
	typedef detail::AnalysisResultModel<IRUnitT, PassT,
	typename PassT::Result,
	PreservedAnalyses, Invalidator>
	ResultModelT;
	return invalidateImpl<ResultModelT>(PassT::ID(), IR, PA);
	}

	/// A type-erased variant of the above invalidate method with the same core
	/// API other than passing an analysis ID rather than an analysis type
	/// parameter.
	///
	/// This is sadly less efficient than the above routine, which leverages
	/// the type parameter to avoid the type erasure overhead.
	bool invalidate(AnalysisKey *ID, IRUnitT &IR, const PreservedAnalyses &PA) {
	return invalidateImpl<>(ID, IR, PA);
	}

	private:
	friend class AnalysisManager;

	template <typename ResultT = ResultConceptT>
	bool invalidateImpl(AnalysisKey *ID, IRUnitT &IR,
	                    const PreservedAnalyses &PA) {
	  // An analysis that was already visited has its verdict memoized: hand it
	  // back unchanged.
	  auto MemoI = IsResultInvalidated.find(ID);
	  if (MemoI != IsResultInvalidated.end())
	    return MemoI->second;

	  // First visit: fetch the cached result object for (ID, IR).
	  auto CachedI = Results.find({ID, &IR});
	  assert(CachedI != Results.end() &&
	         "Trying to invalidate a dependent result that isn't in the "
	         "manager's cache is always an error, likely due to a stale result "
	         "handle!");
	  auto &CachedResult = static_cast<ResultT &>(*CachedI->second->second);

	  // Query the result first and only then touch the memo map. The query may
	  // recurse into this function and insert other entries, so MemoI cannot be
	  // reused; a fresh insert is required.
	  const bool IsInvalid = CachedResult.invalidate(IR, PA, *this);
	  auto Insertion = IsResultInvalidated.insert({ID, IsInvalid});
	  assert(Insertion.second && "Should not have already inserted this ID, likely "
	                             "indicates a dependency cycle!");
	  return Insertion.first->second;
	}

	// Binds this invalidator to caller-owned state. Both arguments are stored
	// as references (see the members below), so they have to outlive the
	// Invalidator. The constructor sits in the private section; AnalysisManager
	// is declared a friend above.
	Invalidator(SmallDenseMap<AnalysisKey *, bool, 8> &IsResultInvalidated,
	const AnalysisResultMapT &Results)
	: IsResultInvalidated(IsResultInvalidated), Results(Results) {}

	SmallDenseMap<AnalysisKey *, bool, 8> &IsResultInvalidated;
	const AnalysisResultMapT &Results;
	};

	/// \brief Construct an empty analysis manager.
	///
	/// If \p DebugLogging is true, we'll log our progress to llvm::dbgs().
	AnalysisManager(bool DebugLogging = false) : DebugLogging(DebugLogging) {}
	// Move construction and move assignment use the compiler-generated
	// member-wise versions. Because move operations are user-declared, the
	// implicit copy constructor and copy assignment are deleted.
	AnalysisManager(AnalysisManager &&) = default;
	AnalysisManager &operator=(AnalysisManager &&) = default;

	/// \brief Returns true if the analysis manager has an empty results cache.
	bool empty() const {
	  const bool NoIndexedResults = AnalysisResults.empty();
	  // The index (AnalysisResults) and the storage (AnalysisResultLists) must
	  // agree on whether anything is cached.
	  assert(NoIndexedResults == AnalysisResultLists.empty() &&
	         "The storage and index of analysis results disagree on how many "
	         "there are!");
	  return NoIndexedResults;
	}

	/// \brief Clear any cached analysis results for a single unit of IR.
	///
	/// This doesn't invalidate, but instead simply deletes, the relevant results.
	/// It is useful when the IR is being removed and we want to clear out all the
	/// memory pinned for it.
	void clear(IRUnitT &IR) {
	  if (DebugLogging)
	    dbgs() << "Clearing all analysis results for: " << IR.getName() << "\n";

	  auto ListI = AnalysisResultLists.find(&IR);
	  if (ListI != AnalysisResultLists.end()) {
	    // First drop every index entry that refers to a node of this list ...
	    for (auto &KeyAndResult : ListI->second)
	      AnalysisResults.erase({KeyAndResult.first, &IR});

	    // ... then erase the list itself, destroying the stored results.
	    AnalysisResultLists.erase(ListI);
	  }
	}

	/// \brief Clear all analysis results cached by this AnalysisManager.
	///
	/// Like \c clear(IRUnitT&), this doesn't invalidate the results; it simply
	/// deletes them. This lets you clean up the AnalysisManager when the set of
	/// IR units itself has potentially changed, and thus we can't even look up a
	/// result and invalidate/clear it directly.
	void clear() {
	AnalysisResults.clear();
	AnalysisResultLists.clear();
	}

	/// \brief Get the result of an analysis pass for a given IR unit.
	///
	/// Runs the analysis if a cached result is not available.
	template <typename PassT>
	typename PassT::Result &getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs) {
	  // Concrete model that wraps PassT's result behind the type-erased concept.
	  using ConcreteResultT =
	      detail::AnalysisResultModel<IRUnitT, PassT, typename PassT::Result,
	                                  PreservedAnalyses, Invalidator>;

	  assert(AnalysisPasses.count(PassT::ID()) &&
	         "This analysis pass was not registered prior to being queried");

	  // getResultImpl returns the cached result or runs the pass to create it;
	  // downcast the concept to reach the typed payload.
	  auto &Model = static_cast<ConcreteResultT &>(
	      getResultImpl(PassT::ID(), IR, ExtraArgs...));
	  return Model.Result;
	}

	/// \brief Get the cached result of an analysis pass for a given IR unit.
	///
	/// This method never runs the analysis.
	///
	/// \returns null if there is no cached result.
	template <typename PassT>
	typename PassT::Result *getCachedResult(IRUnitT &IR) const {
	  // Concrete model that wraps PassT's result behind the type-erased concept.
	  using ConcreteResultT =
	      detail::AnalysisResultModel<IRUnitT, PassT, typename PassT::Result,
	                                  PreservedAnalyses, Invalidator>;

	  assert(AnalysisPasses.count(PassT::ID()) &&
	         "This analysis pass was not registered prior to being queried");

	  // Only a hit in the cache yields a result; the analysis is never run here.
	  if (ResultConceptT *Cached = getCachedResultImpl(PassT::ID(), IR))
	    return &static_cast<ConcreteResultT *>(Cached)->Result;
	  return nullptr;
	}

	/// \brief Register an analysis pass with the manager.
	///
	/// The parameter is a callable whose result is an analysis pass. This allows
	/// passing in a lambda to construct the analysis.
	///
	/// The analysis type to register is the type returned by calling the \c
	/// PassBuilder argument. If that type has already been registered, then the
	/// argument will not be called and this function will return false.
	/// Otherwise, we register the analysis returned by calling \c PassBuilder(),
	/// and this function returns true.
	///
	/// (Note: Although the return value of this function indicates whether or not
	/// an analysis was previously registered, there intentionally isn't a way to
	/// query this directly. Instead, you should just register all the analyses
	/// you might want and let this class run them lazily. This idiom lets us
	/// minimize the number of times we have to look up analyses in our
	/// hashtable.)
	template <typename PassBuilderT>
	bool registerPass(PassBuilderT &&PassBuilder) {
	  // The analysis type is whatever the builder callable returns.
	  using PassT = decltype(PassBuilder());
	  using PassModelT =
	      detail::AnalysisPassModel<IRUnitT, PassT, PreservedAnalyses,
	                                Invalidator, ExtraArgTs...>;

	  // operator[] yields the slot for this analysis ID; a non-null slot means
	  // the type was registered before and the builder must not be invoked.
	  auto &Slot = AnalysisPasses[PassT::ID()];
	  if (!Slot) {
	    // Wrap the freshly built pass instance in its type-erasing model.
	    Slot.reset(new PassModelT(PassBuilder()));
	    return true;
	  }
	  return false;
	}

	/// \brief Invalidate a specific analysis pass for an IR module.
	///
	/// Note that the analysis result can disregard invalidation, if it determines
	/// it is in fact still valid.
	template <typename PassT>
	void invalidate(IRUnitT &IR) {
	  assert(AnalysisPasses.count(PassT::ID()) &&
	         "This analysis pass was not registered prior to being invalidated");
	  // Delegate to the ID-based helper, which drops the cached result for
	  // (PassT, IR) if there is one.
	  this->invalidateImpl(PassT::ID(), IR);
	}

	/// \brief Invalidate cached analyses for an IR unit.
	///
	/// Walk through all of the analyses pertaining to this unit of IR and
	/// invalidate them, unless they are preserved by the PreservedAnalyses set.
	// Works in two phases: first every cached result for IR is asked whether it
	// is invalidated by PA and the answer is recorded in IsResultInvalidated;
	// afterwards the results recorded as invalidated are erased from both the
	// per-IR list and the (ID, IR) index.
	void invalidate(IRUnitT &IR, const PreservedAnalyses &PA) {
	// We're done if all analyses on this IR unit are preserved.
	if (PA.allAnalysesInSetPreserved<AllAnalysesOn<IRUnitT>>())
	return;

	if (DebugLogging)
	dbgs() << "Invalidating all non-preserved analyses for: " << IR.getName()
	<< "\n";

	// Track whether each analysis's result is invalidated in
	// IsResultInvalidated.
	SmallDenseMap<AnalysisKey *, bool, 8> IsResultInvalidated;
	Invalidator Inv(IsResultInvalidated, AnalysisResults);
	// NOTE(review): operator[] presumably creates an empty list when IR has no
	// cached results yet; the ResultsList.empty() check at the end of this
	// function would then remove it again -- confirm.
	AnalysisResultListT &ResultsList = AnalysisResultLists[&IR];
	for (auto &AnalysisResultPair : ResultsList) {
	// This is basically the same thing as Invalidator::invalidate, but we
	// can't call it here because we're operating on the type-erased result.
	// Moreover if we instead called invalidate() directly, it would do an
	// unnecessary look up in ResultsList.
	AnalysisKey *ID = AnalysisResultPair.first;
	auto &Result = *AnalysisResultPair.second;

	auto IMapI = IsResultInvalidated.find(ID);
	if (IMapI != IsResultInvalidated.end())
	// This result was already handled via the Invalidator.
	continue;

	// Try to invalidate the result, giving it the Invalidator so it can
	// recursively query for any dependencies it has and record the result.
	// Note that we cannot reuse 'IMapI' here or pre-insert the ID, as
	// Result.invalidate may insert things into the map, invalidating our
	// iterator.
	bool Inserted =
	IsResultInvalidated.insert({ID, Result.invalidate(IR, PA, Inv)})
	.second;
	(void)Inserted;
	assert(Inserted && "Should never have already inserted this ID, likely "
	"indicates a cycle!");
	}

	// Now erase the results that were marked above as invalidated.
	if (!IsResultInvalidated.empty()) {
	// I is advanced by hand only for entries that are kept; for erased
	// entries it is reassigned from the value returned by erase().
	for (auto I = ResultsList.begin(), E = ResultsList.end(); I != E;) {
	AnalysisKey *ID = I->first;
	if (!IsResultInvalidated.lookup(ID)) {
	++I;
	continue;
	}

	if (DebugLogging)
	dbgs() << "Invalidating analysis: " << this->lookUpPass(ID).name()
	<< "\n";

	I = ResultsList.erase(I);
	// Also remove the index entry that referred to the erased list node.
	AnalysisResults.erase({ID, &IR});
	}
	}

	// Drop the per-IR list entirely once nothing is cached for IR any more.
	if (ResultsList.empty())
	AnalysisResultLists.erase(&IR);
	}

	private:
	/// \brief Look up a registered analysis pass.
	PassConceptT &lookUpPass(AnalysisKey *ID) {
	  // Registration is a precondition; an unknown ID is a programming error.
	  auto RegisteredI = AnalysisPasses.find(ID);
	  assert(RegisteredI != AnalysisPasses.end() &&
	         "Analysis passes must be registered prior to being queried!");
	  return *RegisteredI->second;
	}

	/// \brief Look up a registered analysis pass.
	const PassConceptT &lookUpPass(AnalysisKey *ID) const {
	  // Const flavour of the lookup above; find() on the const map yields a
	  // const_iterator.
	  auto RegisteredI = AnalysisPasses.find(ID);
	  assert(RegisteredI != AnalysisPasses.end() &&
	         "Analysis passes must be registered prior to being queried!");
	  return *RegisteredI->second;
	}

	/// \brief Get an analysis result, running the pass if necessary.
	ResultConceptT &getResultImpl(AnalysisKey *ID, IRUnitT &IR,
	                              ExtraArgTs... ExtraArgs) {
	  // Try to claim the (ID, IR) slot in the index with a placeholder list
	  // iterator. If the slot already existed, Inserted is false and RI refers
	  // to the cached entry.
	  typename AnalysisResultMapT::iterator RI;
	  bool Inserted;
	  std::tie(RI, Inserted) = AnalysisResults.insert(std::make_pair(
	      std::make_pair(ID, &IR), typename AnalysisResultListT::iterator()));

	  // If we don't have a cached result for this function, look up the pass and
	  // run it to produce a result, which we then add to the cache.
	  if (Inserted) {
	    auto &P = this->lookUpPass(ID);
	    if (DebugLogging)
	      dbgs() << "Running analysis: " << P.name() << "\n";

	    // Run the pass *before* taking a reference into AnalysisResultLists.
	    // P.run receives *this and may request further analyses, which insert
	    // into the maps of this manager. A reference obtained up front could
	    // then be left dangling (NOTE(review): this assumes
	    // AnalysisResultListMapT does not keep references stable across
	    // insertion -- confirm against its declaration).
	    auto NewResult = P.run(IR, *this, ExtraArgs...);
	    AnalysisResultListT &ResultList = AnalysisResultLists[&IR];
	    ResultList.emplace_back(ID, std::move(NewResult));

	    // P.run may have inserted elements into AnalysisResults and invalidated
	    // RI.
	    RI = AnalysisResults.find({ID, &IR});
	    assert(RI != AnalysisResults.end() && "we just inserted it!");

	    // Replace the placeholder with the iterator to the node just appended.
	    RI->second = std::prev(ResultList.end());
	  }

	  return *RI->second->second;
	}

	/// \brief Get a cached analysis result or return null.
	// Returns a pointer to the cached result for (ID, IR), or nullptr when the
	// index has no such entry. The return type and the ID parameter are
	// pointers: the body yields either nullptr or an address, and the lookup
	// key is built from ID exactly as in getResultImpl and invalidateImpl.
	ResultConceptT *getCachedResultImpl(AnalysisKey *ID, IRUnitT &IR) const {
	  typename AnalysisResultMapT::const_iterator RI =
	      AnalysisResults.find({ID, &IR});
	  return RI == AnalysisResults.end() ? nullptr : &*RI->second->second;
	}

	/// \brief Invalidate a function pass result.
	// Drops the cached result for (ID, IR), if any, from both the per-IR list
	// and the (ID, IR) index. Does nothing when no such result is cached.
	void invalidateImpl(AnalysisKey *ID, IRUnitT &IR) {
	  typename AnalysisResultMapT::iterator RI =
	      AnalysisResults.find({ID, &IR});
	  if (RI == AnalysisResults.end())
	    return;

	  if (DebugLogging)
	    dbgs() << "Invalidating analysis: " << this->lookUpPass(ID).name()
	           << "\n";

	  // Erase the stored result through the list iterator kept in the index,
	  // then the index entry itself.
	  AnalysisResultListT &ResultList = AnalysisResultLists[&IR];
	  ResultList.erase(RI->second);
	  AnalysisResults.erase(RI);

	  // Keep the storage in step with the index: empty() asserts that both are
	  // empty together, so an emptied per-IR list must not be left behind.
	  // This mirrors the cleanup at the end of invalidate(IR, PA).
	  if (ResultList.empty())
	    AnalysisResultLists.erase(&IR);
	}

	/// \brief Map type from module analysis pass ID to pass concept pointer.
	typedef DenseMap<AnalysisKey *, std::unique_ptr<PassConceptT>> AnalysisPassMapT;

	/// \brief Collection of module analysis passes, indexed by ID.
	AnalysisPassMapT AnalysisPasses;

	/// \brief Map from function to a list of function analysis results.
	///
	/// Provides linear time removal of all analysis results for a function and
	/// the ultimate storage for a particular cached analysis result.
	AnalysisResultListMapT AnalysisResultLists;

	/// \brief Map from an analysis ID and function to a particular cached
	/// analysis result.
	AnalysisResultMapT AnalysisResults;

	/// \brief Indicates whether we log to \c llvm::dbgs().
	bool DebugLogging;
	};

	extern template class AnalysisManager<Module>;
	/// \brief Convenience typedef for the Module analysis manager.
	typedef AnalysisManager<Module> ModuleAnalysisManager;

	extern template class AnalysisManager<Function>;
	/// \brief Convenience typedef for the Function analysis manager.
	typedef AnalysisManager<Function> FunctionAnalysisManager;

	/// \brief An analysis over an "outer" IR unit that provides access to an
	/// analysis manager over an "inner" IR unit. The inner unit must be contained
	/// in the outer unit.
	///
	/// For example, InnerAnalysisManagerProxy<FunctionAnalysisManager, Module> is
	/// an analysis over Modules (the "outer" unit) that provides access to a
	/// Function analysis manager. The FunctionAnalysisManager is the "inner"
	/// manager being proxied, and Functions are the "inner" unit. The inner/outer
	/// relationship is valid because each Function is contained in one Module.
	///
	/// If you're (transitively) within a pass manager for an IR unit U that
	/// contains IR unit V, you should never use an analysis manager over V, except
	/// via one of these proxies.
	///
	/// Note that the proxy's result is a move-only RAII object. The validity of
	/// the analyses in the inner analysis manager is tied to its lifetime.
	template <typename AnalysisManagerT, typename IRUnitT, typename... ExtraArgTs>
	class InnerAnalysisManagerProxy
	    : public AnalysisInfoMixin<
	          InnerAnalysisManagerProxy<AnalysisManagerT, IRUnitT>> {
	  // NOTE(review): the CRTP base above and the friend declaration below name
	  // this template without ExtraArgTs...; confirm that this is intended for
	  // instantiations with a non-empty ExtraArgTs pack.
	public:
	  /// \brief Move-only handle to the inner analysis manager.
	  ///
	  /// Whichever Result currently holds a non-null manager pointer clears that
	  /// manager when it is destroyed.
	  class Result {
	  public:
	    explicit Result(AnalysisManagerT &InnerAM) : InnerAM(&InnerAM) {}
	    Result(Result &&Arg) : InnerAM(std::move(Arg.InnerAM)) {
	      // We have to null out the analysis manager in the moved-from state
	      // because we are taking ownership of the responsibility to clear the
	      // analysis state.
	      Arg.InnerAM = nullptr;
	    }
	    Result &operator=(Result &&RHS) {
	      // Self-move must be a no-op. Without this guard the pointer would be
	      // copied onto itself and then nulled, and the destructor would
	      // silently skip clearing the inner manager.
	      if (this == &RHS)
	        return *this;

	      InnerAM = RHS.InnerAM;
	      // We have to null out the analysis manager in the moved-from state
	      // because we are taking ownership of the responsibility to clear the
	      // analysis state.
	      RHS.InnerAM = nullptr;
	      return *this;
	    }
	    ~Result() {
	      // InnerAM is cleared in a moved from state where there is nothing to do.
	      if (!InnerAM)
	        return;

	      // Clear out the analysis manager if we're being destroyed -- it means we
	      // didn't even see an invalidate call when we got invalidated.
	      InnerAM->clear();
	    }

	    /// \brief Accessor for the analysis manager.
	    AnalysisManagerT &getManager() { return *InnerAM; }

	    /// \brief Handler for invalidation of the outer IR unit, \c IRUnitT.
	    ///
	    /// If the proxy analysis itself is not preserved, we assume that the set of
	    /// inner IR objects contained in IRUnit may have changed. In this case,
	    /// we have to call \c clear() on the inner analysis manager, as it may now
	    /// have stale pointers to its inner IR objects.
	    ///
	    /// Regardless of whether the proxy analysis is marked as preserved, all of
	    /// the analyses in the inner analysis manager are potentially invalidated
	    /// based on the set of preserved analyses.
	    bool invalidate(
	        IRUnitT &IR, const PreservedAnalyses &PA,
	        typename AnalysisManager<IRUnitT, ExtraArgTs...>::Invalidator &Inv);

	  private:
	    /// Null only in the moved-from state.
	    AnalysisManagerT *InnerAM;
	  };

	  explicit InnerAnalysisManagerProxy(AnalysisManagerT &InnerAM)
	      : InnerAM(&InnerAM) {}

	  /// \brief Run the analysis pass and create our proxy result object.
	  ///
	  /// This doesn't do any interesting work; it is primarily used to insert our
	  /// proxy result object into the outer analysis cache so that we can proxy
	  /// invalidation to the inner analysis manager.
	  Result run(IRUnitT &IR, AnalysisManager<IRUnitT, ExtraArgTs...> &AM,
	             ExtraArgTs...) {
	    return Result(*InnerAM);
	  }

	private:
	  friend AnalysisInfoMixin<
	      InnerAnalysisManagerProxy<AnalysisManagerT, IRUnitT>>;
	  static AnalysisKey Key;

	  AnalysisManagerT *InnerAM;
	};

	template <typename AnalysisManagerT, typename IRUnitT, typename... ExtraArgTs>
	AnalysisKey
	InnerAnalysisManagerProxy<AnalysisManagerT, IRUnitT, ExtraArgTs...>::Key;

	/// Provide the \c FunctionAnalysisManager to \c Module proxy.
	typedef InnerAnalysisManagerProxy<FunctionAnalysisManager, Module>
	FunctionAnalysisManagerModuleProxy;

	/// Specialization of the invalidate method for the \c
	/// FunctionAnalysisManagerModuleProxy's result.
	template <>
	bool FunctionAnalysisManagerModuleProxy::Result::invalidate(
	Module &M, const PreservedAnalyses &PA,
	ModuleAnalysisManager::Invalidator &Inv);

	// Ensure the \c FunctionAnalysisManagerModuleProxy is provided as an extern
	// template.
	extern template class InnerAnalysisManagerProxy<FunctionAnalysisManager,
	Module>;

	/// \brief An analysis over an "inner" IR unit that provides access to an
	/// analysis manager over an "outer" IR unit. The inner unit must be contained
	/// in the outer unit.
	///
	/// For example OuterAnalysisManagerProxy<ModuleAnalysisManager, Function> is an
	/// analysis over Functions (the "inner" unit) which provides access to a Module
	/// analysis manager. The ModuleAnalysisManager is the "outer" manager being
	/// proxied, and Modules are the "outer" IR unit. The inner/outer relationship
	/// is valid because each Function is contained in one Module.
	///
	/// This proxy only exposes the const interface of the outer analysis manager,
	/// to indicate that you cannot cause an outer analysis to run from within an
	/// inner pass. Instead, you must rely on the \c getCachedResult API.
	///
	/// This proxy doesn't manage invalidation in any way -- that is handled by the
	/// recursive return path of each layer of the pass manager. A consequence of
	/// this is the outer analyses may be stale. We invalidate the outer analyses
	/// only when we're done running passes over the inner IR units.
	template <typename AnalysisManagerT, typename IRUnitT, typename... ExtraArgTs>
	class OuterAnalysisManagerProxy
	: public AnalysisInfoMixin<
	- OuterAnalysisManagerProxy<AnalysisManagerT, IRUnitT>> {
	+ OuterAnalysisManagerProxy<AnalysisManagerT, IRUnitT, ExtraArgTs...>> {
	public:
	/// \brief Result proxy object for \c OuterAnalysisManagerProxy.
	class Result {
	public:
	  explicit Result(const AnalysisManagerT &AM) : AM(&AM) {}

	  /// Read-only access to the outer analysis manager.
	  const AnalysisManagerT &getManager() const { return *AM; }

	  /// \brief Handle invalidation by ignoring it; this pass is immutable.
	  bool invalidate(
	      IRUnitT &, const PreservedAnalyses &,
	      typename AnalysisManager<IRUnitT, ExtraArgTs...>::Invalidator &) {
	    return false;
	  }

	  /// Register a deferred invalidation event for when the outer analysis
	  /// manager processes its invalidations.
	  ///
	  /// Records that \c InvalidatedAnalysisT needs invalidating when
	  /// \c OuterAnalysisT is invalidated. Registering the same pair again has
	  /// no effect.
	  template <typename OuterAnalysisT, typename InvalidatedAnalysisT>
	  void registerOuterAnalysisInvalidation() {
	    AnalysisKey *OuterID = OuterAnalysisT::ID();
	    AnalysisKey *InvalidatedID = InvalidatedAnalysisT::ID();

	    auto &InvalidatedIDList = OuterAnalysisInvalidationMap[OuterID];
	    // Note, this is a linear scan. If we end up with large numbers of
	    // analyses that all trigger invalidation on the same outer analysis,
	    // this entire system should be changed to some other deterministic
	    // data structure such as a `SetVector` of a pair of pointers.
	    auto InvalidatedIt = std::find(InvalidatedIDList.begin(),
	                                   InvalidatedIDList.end(), InvalidatedID);
	    if (InvalidatedIt == InvalidatedIDList.end())
	      InvalidatedIDList.push_back(InvalidatedID);
	  }

	  /// Access the map from outer analyses to deferred invalidation requiring
	  /// analyses.
	  const SmallDenseMap<AnalysisKey *, TinyPtrVector<AnalysisKey *>, 2> &
	  getOuterInvalidations() const {
	    return OuterAnalysisInvalidationMap;
	  }

	private:
	  const AnalysisManagerT *AM;

	  /// A map from an outer analysis ID to the set of this IR-unit's analyses
	  /// which need to be invalidated. Keys and list elements are the
	  /// \c AnalysisKey pointers obtained from the analyses' \c ID() functions.
	  SmallDenseMap<AnalysisKey *, TinyPtrVector<AnalysisKey *>, 2>
	      OuterAnalysisInvalidationMap;
	};

	// Stores the address of AM. run() hands that same manager to every Result
	// it creates, so AM has to outlive the proxy and its results. The
	// constructor is not marked explicit.
	OuterAnalysisManagerProxy(const AnalysisManagerT &AM) : AM(&AM) {}

	/// \brief Run the analysis pass and create our proxy result object.
	/// Nothing to see here, it just forwards the \c AM reference into the
	/// result.
	Result run(IRUnitT &, AnalysisManager<IRUnitT, ExtraArgTs...> &,
	           ExtraArgTs...) {
	  // All arguments are ignored; the result only wraps the outer manager
	  // this proxy was constructed with.
	  Result OuterHandle(*AM);
	  return OuterHandle;
	}

	private:
	friend AnalysisInfoMixin<
	- OuterAnalysisManagerProxy<AnalysisManagerT, IRUnitT>>;
	+ OuterAnalysisManagerProxy<AnalysisManagerT, IRUnitT, ExtraArgTs...>>;
	static AnalysisKey Key;

	const AnalysisManagerT *AM;
	};

	template <typename AnalysisManagerT, typename IRUnitT, typename... ExtraArgTs>
	AnalysisKey
	OuterAnalysisManagerProxy<AnalysisManagerT, IRUnitT, ExtraArgTs...>::Key;

	extern template class OuterAnalysisManagerProxy<ModuleAnalysisManager,
	Function>;
	/// Provide the \c ModuleAnalysisManager to \c Function proxy.
	typedef OuterAnalysisManagerProxy<ModuleAnalysisManager, Function>
	ModuleAnalysisManagerFunctionProxy;

	/// \brief Trivial adaptor that maps from a module to its functions.
	///
	/// Designed to allow composition of a FunctionPass(Manager) and
	/// a ModulePassManager, by running the FunctionPass(Manager) over every
	/// function in the module.
	///
	/// Function passes run within this adaptor can rely on having exclusive access
	/// to the function they are run over. They should not read or modify any other
	/// functions! Other threads or systems may be manipulating other functions in
	/// the module, and so their state should never be relied on.
	/// FIXME: Make the above true for all of LLVM's actual passes, some still
	/// violate this principle.
	///
	/// Function passes can also read the module containing the function, but they
	/// should not modify that module outside of the use lists of various globals.
	/// For example, a function pass is not permitted to add functions to the
	/// module.
	/// FIXME: Make the above true for all of LLVM's actual passes, some still
	/// violate this principle.
	///
	/// Note that although function passes can access module analyses, module
	/// analyses are not invalidated while the function passes are running, so they
	/// may be stale. Function analyses will not be stale.
	template <typename FunctionPassT>
	class ModuleToFunctionPassAdaptor
	    : public PassInfoMixin<ModuleToFunctionPassAdaptor<FunctionPassT>> {
	public:
	  explicit ModuleToFunctionPassAdaptor(FunctionPassT Pass)
	      : Pass(std::move(Pass)) {}

	  /// \brief Applies the wrapped function pass to each defined function of
	  /// \p M and returns the intersection of what those runs preserved.
	  PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM) {
	    // The function-level analysis manager is reached through the module
	    // proxy analysis.
	    auto &ProxyResult = AM.getResult<FunctionAnalysisManagerModuleProxy>(M);
	    FunctionAnalysisManager &FAM = ProxyResult.getManager();

	    PreservedAnalyses PA = PreservedAnalyses::all();
	    for (Function &F : M) {
	      // Declarations are skipped.
	      if (!F.isDeclaration()) {
	        PreservedAnalyses PassPA = Pass.run(F, FAM);

	        // A function pass may only affect its own function's analyses
	        // (that is its contract), so the function-level invalidation is
	        // performed right here.
	        FAM.invalidate(F, PassPA);

	        // Fold this run into the overall set so that module analyses get
	        // invalidated once the module pass completes.
	        PA.intersect(std::move(PassPA));
	      }
	    }

	    // Function analyses were already invalidated as needed above, so the
	    // whole set is reported as preserved. The proxy itself is preserved on
	    // the assumption that the function passes neither added nor removed
	    // functions.
	    PA.preserveSet<AllAnalysesOn<Function>>();
	    PA.preserve<FunctionAnalysisManagerModuleProxy>();
	    return PA;
	  }

	private:
	  FunctionPassT Pass;
	};

	/// \brief A function to deduce a function pass type and wrap it in the
	/// templated adaptor.
	template <typename FunctionPassT>
	ModuleToFunctionPassAdaptor<FunctionPassT>
	createModuleToFunctionPassAdaptor(FunctionPassT Pass) {
	  // FunctionPassT is deduced from the argument; the pass is moved into the
	  // adaptor.
	  using AdaptorT = ModuleToFunctionPassAdaptor<FunctionPassT>;
	  return AdaptorT(std::move(Pass));
	}

	/// \brief A utility pass template to force an analysis result to be available.
	///
	/// If there are extra arguments at the pass's run level there may also be
	/// extra arguments to the analysis manager's \c getResult routine. We can't
	/// guess how to effectively map the arguments from one to the other, and so
	/// this specialization just ignores them.
	///
	/// Specific patterns of run-method extra arguments and analysis manager extra
	/// arguments will have to be defined as appropriate specializations.
	template <typename AnalysisT, typename IRUnitT,
	          typename AnalysisManagerT = AnalysisManager<IRUnitT>,
	          typename... ExtraArgTs>
	struct RequireAnalysisPass
	    : PassInfoMixin<RequireAnalysisPass<AnalysisT, IRUnitT, AnalysisManagerT,
	                                        ExtraArgTs...>> {
	  /// \brief Run this pass over some unit of IR.
	  ///
	  /// Requests the result of \c AnalysisT for \p Arg from \p AM, forwarding
	  /// the extra arguments, and discards the returned reference. Every
	  /// analysis is reported as preserved.
	  PreservedAnalyses run(IRUnitT &Arg, AnalysisManagerT &AM,
	                        ExtraArgTs &&... Args) {
	    // Only the side effect of the query matters; the result is unused.
	    static_cast<void>(AM.template getResult<AnalysisT>(
	        Arg, std::forward<ExtraArgTs>(Args)...));

	    return PreservedAnalyses::all();
	  }
	};

	/// \brief A no-op pass template which simply forces a specific analysis result
	/// to be invalidated.
	template <typename AnalysisT>
	struct InvalidateAnalysisPass
	    : PassInfoMixin<InvalidateAnalysisPass<AnalysisT>> {
	  /// \brief Run this pass over some unit of IR.
	  ///
	  /// Neither the IR unit nor the analysis manager is touched. The returned
	  /// set starts out as "everything preserved" and then has \c AnalysisT
	  /// abandoned.
	  template <typename IRUnitT, typename AnalysisManagerT, typename... ExtraArgTs>
	  PreservedAnalyses run(IRUnitT &Arg, AnalysisManagerT &AM, ExtraArgTs &&...) {
	    PreservedAnalyses Preserved = PreservedAnalyses::all();
	    Preserved.abandon<AnalysisT>();
	    return Preserved;
	  }
	};

	/// \brief A utility pass that does nothing, but preserves no analyses.
	///
	/// Because this preserves no analyses, any analysis passes queried after this
	/// pass runs will recompute fresh results.
	struct InvalidateAllAnalysesPass : PassInfoMixin<InvalidateAllAnalysesPass> {
	  /// \brief Run this pass over some unit of IR.
	  ///
	  /// All arguments are ignored; the pass reports that it preserves nothing.
	  template <typename IRUnitT, typename AnalysisManagerT, typename... ExtraArgTs>
	  PreservedAnalyses run(IRUnitT &, AnalysisManagerT &, ExtraArgTs &&...) {
	    PreservedAnalyses NothingPreserved = PreservedAnalyses::none();
	    return NothingPreserved;
	  }
	};

	/// A utility pass template that simply runs another pass multiple times.
	///
	/// This can be useful when debugging or testing passes. It also serves as an
	/// example of how to extend the pass manager in ways beyond composition.
	template <typename PassT>
	class RepeatedPass : public PassInfoMixin<RepeatedPass<PassT>> {
	public:
	  RepeatedPass(int Count, PassT P) : Count(Count), P(std::move(P)) {}

	  /// Runs the wrapped pass \c Count times on \p Arg (not at all when
	  /// \c Count is zero or negative) and returns the intersection of the
	  /// preserved sets of all runs.
	  template <typename IRUnitT, typename AnalysisManagerT, typename... Ts>
	  PreservedAnalyses run(IRUnitT &Arg, AnalysisManagerT &AM, Ts &&... Args) {
	    PreservedAnalyses Combined = PreservedAnalyses::all();
	    // NOTE(review): Args are forwarded afresh on every iteration; if P.run
	    // consumes an rvalue argument, later iterations would see a moved-from
	    // value -- confirm against the callers.
	    for (int Remaining = Count; Remaining > 0; --Remaining)
	      Combined.intersect(P.run(Arg, AM, std::forward<Ts>(Args)...));
	    return Combined;
	  }

	private:
	  int Count;
	  PassT P;
	};

	// Deduces PassT from the argument and wraps the pass so that it runs
	// Count times.
	template <typename PassT>
	RepeatedPass<PassT> createRepeatedPass(int Count, PassT P) {
	  using RepeatedT = RepeatedPass<PassT>;
	  return RepeatedT(Count, std::move(P));
	}

	}

	#endif
	Index: projects/clang400-import/contrib/llvm/include/llvm/Target/TargetInstrInfo.h
	===================================================================
	--- projects/clang400-import/contrib/llvm/include/llvm/Target/TargetInstrInfo.h (revision 313642)
	+++ projects/clang400-import/contrib/llvm/include/llvm/Target/TargetInstrInfo.h (revision 313643)
	@@ -1,1548 +1,1529 @@
	//===-- llvm/Target/TargetInstrInfo.h - Instruction Info --------*- C++ -*-===//
	//
	// The LLVM Compiler Infrastructure
	//
	// This file is distributed under the University of Illinois Open Source
	// License. See LICENSE.TXT for details.
	//
	//===----------------------------------------------------------------------===//
	//
	// This file describes the target machine instruction set to the code generator.
	//
	//===----------------------------------------------------------------------===//

	#ifndef LLVM_TARGET_TARGETINSTRINFO_H
	#define LLVM_TARGET_TARGETINSTRINFO_H

	#include "llvm/ADT/DenseMap.h"
	#include "llvm/ADT/SmallSet.h"
	#include "llvm/CodeGen/MachineCombinerPattern.h"
	#include "llvm/CodeGen/MachineFunction.h"
	#include "llvm/CodeGen/MachineLoopInfo.h"
	#include "llvm/MC/MCInstrInfo.h"
	#include "llvm/Support/BranchProbability.h"
	#include "llvm/Target/TargetRegisterInfo.h"
	#include "llvm/CodeGen/LiveIntervalAnalysis.h"

	namespace llvm {

	class InstrItineraryData;
	class LiveVariables;
	class MCAsmInfo;
	class MachineMemOperand;
	class MachineRegisterInfo;
	class MDNode;
	class MCInst;
	struct MCSchedModel;
	class MCSymbolRefExpr;
	class SDNode;
	class ScheduleHazardRecognizer;
	class SelectionDAG;
	class ScheduleDAG;
	class TargetRegisterClass;
	class TargetRegisterInfo;
	class TargetSubtargetInfo;
	class TargetSchedModel;
	class DFAPacketizer;

	template<class T> class SmallVectorImpl;

	//---------------------------------------------------------------------------
	///
	/// TargetInstrInfo - Interface to description of machine instruction set
	///
	class TargetInstrInfo : public MCInstrInfo {
	TargetInstrInfo(const TargetInstrInfo &) = delete;
	void operator=(const TargetInstrInfo &) = delete;
	public:
	// Records the four target-specific opcodes. Each one defaults to ~0u, the
	// unsigned form of the -1 "does not exist" value mentioned on the
	// call-frame accessors below; presumably the same convention applies to
	// the catch-return and return opcodes -- confirm.
	TargetInstrInfo(unsigned CFSetupOpcode = ~0u, unsigned CFDestroyOpcode = ~0u,
	unsigned CatchRetOpcode = ~0u, unsigned ReturnOpcode = ~0u)
	: CallFrameSetupOpcode(CFSetupOpcode),
	CallFrameDestroyOpcode(CFDestroyOpcode),
	CatchRetOpcode(CatchRetOpcode),
	ReturnOpcode(ReturnOpcode) {}

	virtual ~TargetInstrInfo();

	// True for every opcode up to and including TargetOpcode::GENERIC_OP_END.
	static bool isGenericOpcode(unsigned Opc) {
	  const bool PastGenericRange = Opc > TargetOpcode::GENERIC_OP_END;
	  return !PastGenericRange;
	}

	/// Given a machine instruction descriptor, returns the register
	/// class constraint for OpNum, or NULL.
	const TargetRegisterClass *getRegClass(const MCInstrDesc &TID,
	unsigned OpNum,
	const TargetRegisterInfo *TRI,
	const MachineFunction &MF) const;

	/// Return true if the instruction is trivially rematerializable, meaning it
	/// has no side effects and requires no operands that aren't always available.
	/// This means the only allowed uses are constants and unallocatable physical
	/// registers so that the instruction's result is independent of the place
	/// in the function.
	bool isTriviallyReMaterializable(const MachineInstr &MI,
	                                 AliasAnalysis *AA = nullptr) const {
	  // IMPLICIT_DEF always qualifies. Any other instruction must be flagged
	  // rematerializable in its descriptor and then pass either the target
	  // hook or the target-independent check. The hook is consulted first, and
	  // the generic check only runs when the hook returns false.
	  return MI.getOpcode() == TargetOpcode::IMPLICIT_DEF ||
	         (MI.getDesc().isRematerializable() &&
	          (isReallyTriviallyReMaterializable(MI, AA) ||
	           isReallyTriviallyReMaterializableGeneric(MI, AA)));
	}

	protected:
	/// For instructions with opcodes for which the M_REMATERIALIZABLE flag is
	/// set, this hook lets the target specify whether the instruction is actually
	/// trivially rematerializable, taking into consideration its operands. This
	/// predicate must return false if the instruction has any side effects other
	/// than producing a value, or if it requires any address registers that are
	/// not always available.
	/// Requirements must be checked as stated in isTriviallyReMaterializable().
	virtual bool isReallyTriviallyReMaterializable(const MachineInstr &MI,
	AliasAnalysis *AA) const {
	// Default implementation: never claims rematerializability on the target's
	// behalf. isTriviallyReMaterializable() then falls back to the generic
	// check.
	return false;
	}

	/// This method commutes the operands of the given machine instruction MI.
	/// The operands to be commuted are specified by their indices OpIdx1 and
	/// OpIdx2.
	///
	/// If a target has any instructions that are commutable but require
	/// converting to different instructions or making non-trivial changes
	/// to commute them, this method can be overloaded to do that.
	/// The default implementation simply swaps the commutable operands.
	///
	/// If NewMI is false, MI is modified in place and returned; otherwise, a
	/// new machine instruction is created and returned.
	///
	/// Do not call this method for a non-commutable instruction.
	/// Even though the instruction is commutable, the method may still
	/// fail to commute the operands, null pointer is returned in such cases.
	virtual MachineInstr *commuteInstructionImpl(MachineInstr &MI, bool NewMI,
	unsigned OpIdx1,
	unsigned OpIdx2) const;

	/// Assigns the (CommutableOpIdx1, CommutableOpIdx2) pair of commutable
	/// operand indices to (ResultIdx1, ResultIdx2).
	/// One or both input values of the pair: (ResultIdx1, ResultIdx2) may be
	/// predefined to some indices or be undefined (designated by the special
	/// value 'CommuteAnyOperandIndex').
	/// The predefined result indices cannot be re-defined.
	/// The function returns true iff after the result pair redefinition
	/// the fixed result pair is equal to or equivalent to the source pair of
	/// indices: (CommutableOpIdx1, CommutableOpIdx2). It is assumed here that
	/// the pairs (x,y) and (y,x) are equivalent.
	static bool fixCommutedOpIndices(unsigned &ResultIdx1,
	unsigned &ResultIdx2,
	unsigned CommutableOpIdx1,
	unsigned CommutableOpIdx2);

	private:
	/// For instructions with opcodes for which the M_REMATERIALIZABLE flag is
	/// set and the target hook isReallyTriviallyReMaterializable returns false,
	/// this function does target-independent tests to determine if the
	/// instruction is really trivially rematerializable.
	bool isReallyTriviallyReMaterializableGeneric(const MachineInstr &MI,
	AliasAnalysis *AA) const;

	public:
	/// These methods return the opcode of the frame setup/destroy instructions
	/// if they exist (-1 otherwise). Some targets use pseudo instructions in
	/// order to abstract away the difference between operating with a frame
	/// pointer and operating without, through the use of these two instructions.
	///
	unsigned getCallFrameSetupOpcode() const { return CallFrameSetupOpcode; }
	unsigned getCallFrameDestroyOpcode() const { return CallFrameDestroyOpcode; }

	unsigned getCatchReturnOpcode() const { return CatchRetOpcode; }
	unsigned getReturnOpcode() const { return ReturnOpcode; }

	/// Returns the actual stack pointer adjustment made by an instruction
	/// as part of a call sequence. By default, only call frame setup/destroy
	/// instructions adjust the stack, but targets may want to override this
	/// to enable more fine-grained adjustment, or adjust by a different value.
	virtual int getSPAdjust(const MachineInstr &MI) const;

	/// Return true if the instruction is a "coalescable" extension instruction.
	/// That is, it's like a copy where it's legal for the source to overlap the
	/// destination. e.g. X86::MOVSX64rr32. If this returns true, then it's
	/// expected the pre-extension value is available as a subreg of the result
	/// register. This also returns the sub-register index in SubIdx.
	virtual bool isCoalescableExtInstr(const MachineInstr &MI, unsigned &SrcReg,
	                                   unsigned &DstReg,
	                                   unsigned &SubIdx) const {
	  // Base implementation: never reports a coalescable extension and leaves
	  // SrcReg, DstReg and SubIdx unwritten.
	  return false;
	}

	/// If the specified machine instruction is a direct
	/// load from a stack slot, return the virtual or physical register number of
	/// the destination along with the FrameIndex of the loaded stack slot. If
	/// not, return 0. This predicate must return 0 if the instruction has
	/// any side effects other than loading from the stack slot.
	virtual unsigned isLoadFromStackSlot(const MachineInstr &MI,
	                                     int &FrameIndex) const {
	  // Base implementation: 0 means "not a direct stack-slot load";
	  // FrameIndex is left unwritten.
	  return 0;
	}

	/// Check for post-frame ptr elimination stack locations as well.
	/// This uses a heuristic so it isn't reliable for correctness.
	virtual unsigned isLoadFromStackSlotPostFE(const MachineInstr &MI,
	                                           int &FrameIndex) const {
	  // Base implementation: always 0; FrameIndex is left unwritten.
	  return 0;
	}

	/// If the specified machine instruction has a load from a stack slot,
	/// return true along with the FrameIndex of the loaded stack slot and the
	/// machine mem operand containing the reference.
	/// If not, return false. Unlike isLoadFromStackSlot, this returns true for
	/// any instruction that loads from the stack. This is just a hint, as some
	/// cases may be missed.
	virtual bool hasLoadFromStackSlot(const MachineInstr &MI,
	const MachineMemOperand *&MMO,
	int &FrameIndex) const;

	/// If the specified machine instruction is a direct
	/// store to a stack slot, return the virtual or physical register number of
	/// the source reg along with the FrameIndex of the stack slot stored to. If
	/// not, return 0. This predicate must return 0 if the instruction has
	/// any side effects other than storing to the stack slot.
	virtual unsigned isStoreToStackSlot(const MachineInstr &MI,
	                                    int &FrameIndex) const {
	  // Base implementation: 0 means "not a direct stack-slot store";
	  // FrameIndex is left unwritten.
	  return 0;
	}

	/// Check for post-frame ptr elimination stack locations as well.
	/// This uses a heuristic, so it isn't reliable for correctness.
	virtual unsigned isStoreToStackSlotPostFE(const MachineInstr &MI,
	                                          int &FrameIndex) const {
	  // Base implementation: always 0; FrameIndex is left unwritten.
	  return 0;
	}

	/// If the specified machine instruction has a store to a stack slot,
	/// return true along with the FrameIndex of the stack slot stored to and the
	/// machine mem operand containing the reference.
	/// If not, return false. Unlike isStoreToStackSlot,
	/// this returns true for any instruction that stores to the
	/// stack. This is just a hint, as some cases may be missed.
	virtual bool hasStoreToStackSlot(const MachineInstr &MI,
	const MachineMemOperand *&MMO,
	int &FrameIndex) const;

	/// Return true if the specified machine instruction
	/// is a copy of one stack slot to another and has no other effect.
	/// Provide the identity of the two frame indices.
	virtual bool isStackSlotCopy(const MachineInstr &MI, int &DestFrameIndex,
	                             int &SrcFrameIndex) const {
	  // Base implementation: never reports a slot-to-slot copy; both frame
	  // index outputs are left unwritten.
	  return false;
	}

	/// Compute the size in bytes and offset within a stack slot of a spilled
	/// register or subregister.
	///
	/// \param [out] Size in bytes of the spilled value.
	/// \param [out] Offset in bytes within the stack slot.
	/// \returns true if both Size and Offset are successfully computed.
	///
	/// Not all subregisters have computable spill slots. For example,
	/// subregisters may not be byte-sized, and a pair of discontiguous
	/// subregisters has no single offset.
	///
	/// Targets with nontrivial bigendian implementations may need to override
	/// this, particularly to support spilled vector registers.
	virtual bool getStackSlotRange(const TargetRegisterClass *RC, unsigned SubIdx,
	unsigned &Size, unsigned &Offset,
	const MachineFunction &MF) const;

	/// Returns the size in bytes of the specified MachineInstr, or ~0U
	/// when this function is not implemented by a target.
	virtual unsigned getInstSizeInBytes(const MachineInstr &MI) const {
	  // ~0U is the documented "not implemented by this target" marker.
	  const unsigned NotImplemented = ~0U;
	  return NotImplemented;
	}

	/// Return true if the instruction is as cheap as a move instruction.
	///
	/// Targets for different archs need to override this, and different
	/// micro-architectures can also be finely tuned inside.
	virtual bool isAsCheapAsAMove(const MachineInstr &MI) const {
	  // Base implementation defers entirely to the instruction's own answer.
	  const bool CheapAsMove = MI.isAsCheapAsAMove();
	  return CheapAsMove;
	}

	/// Return true if the instruction should be sunk by MachineSink.
	///
	/// MachineSink determines on its own whether the instruction is safe to sink;
	/// this gives the target a hook to override the default behavior with regards
	/// to which instructions should be sunk.
	virtual bool shouldSink(const MachineInstr &MI) const {
	  // Base implementation: no veto, every instruction may be sunk.
	  return true;
	}

	/// Re-issue the specified 'original' instruction at the
	/// specific location targeting a new destination register.
	/// The register in Orig->getOperand(0).getReg() will be substituted by
	/// DestReg:SubIdx. Any existing subreg index is preserved or composed with
	/// SubIdx.
	virtual void reMaterialize(MachineBasicBlock &MBB,
	MachineBasicBlock::iterator MI, unsigned DestReg,
	unsigned SubIdx, const MachineInstr &Orig,
	const TargetRegisterInfo &TRI) const;

	/// Create a duplicate of the Orig instruction in MF. This is like
	/// MachineFunction::CloneMachineInstr(), but the target may update operands
	/// that are required to be unique.
	///
	/// The instruction must be duplicable as indicated by isNotDuplicable().
	virtual MachineInstr *duplicate(MachineInstr &Orig,
	MachineFunction &MF) const;

	/// This method must be implemented by targets that
	/// set the M_CONVERTIBLE_TO_3_ADDR flag. When this flag is set, the target
	/// may be able to convert a two-address instruction into one or more true
	/// three-address instructions on demand. This allows the X86 target (for
	/// example) to convert ADD and SHL instructions into LEA instructions if they
	/// would require register copies due to two-addressness.
	///
	/// This method returns a null pointer if the transformation cannot be
	/// performed, otherwise it returns the last new instruction.
	///
	virtual MachineInstr *
	convertToThreeAddress(MachineFunction::iterator &MFI, MachineInstr &MI,
	                      LiveVariables *LV) const {
	  // Base implementation: performs no conversion, reported as a null result.
	  return nullptr;
	}

	// This constant can be used as an input value of operand index passed to
	// the method findCommutedOpIndices() to tell the method that the
	// corresponding operand index is not pre-defined and that the method
	// can pick any commutable operand.
	static const unsigned CommuteAnyOperandIndex = ~0U; // All bits set.

	/// This method commutes the operands of the given machine instruction MI.
	///
	/// The operands to be commuted are specified by their indices OpIdx1 and
	/// OpIdx2. OpIdx1 and OpIdx2 arguments may be set to a special value
	/// 'CommuteAnyOperandIndex', which means that the method is free to choose
	/// any arbitrarily chosen commutable operand. If both arguments are set to
	/// 'CommuteAnyOperandIndex' then the method looks for 2 different commutable
	/// operands; then commutes them if such operands could be found.
	///
	/// If NewMI is false, MI is modified in place and returned; otherwise, a
	/// new machine instruction is created and returned.
	///
	/// Do not call this method for a non-commutable instruction or
	/// for non-commutable operands.
	/// Even though the instruction is commutable, the method may still
	/// fail to commute the operands, null pointer is returned in such cases.
	MachineInstr *
	commuteInstruction(MachineInstr &MI, bool NewMI = false,
	unsigned OpIdx1 = CommuteAnyOperandIndex,
	unsigned OpIdx2 = CommuteAnyOperandIndex) const;

	/// Returns true iff the routine could find two commutable operands in the
	/// given machine instruction.
	/// The 'SrcOpIdx1' and 'SrcOpIdx2' are INPUT and OUTPUT arguments.
	/// If any of the INPUT values is set to the special value
	/// 'CommuteAnyOperandIndex' then the method arbitrarily picks a commutable
	/// operand, then returns its index in the corresponding argument.
	/// If both of INPUT values are set to 'CommuteAnyOperandIndex' then method
	/// looks for 2 commutable operands.
	/// If INPUT values refer to some operands of MI, then the method simply
	/// returns true if the corresponding operands are commutable and returns
	/// false otherwise.
	///
	/// For example, calling this method this way:
	/// unsigned Op1 = 1, Op2 = CommuteAnyOperandIndex;
	/// findCommutedOpIndices(MI, Op1, Op2);
	/// can be interpreted as a query asking to find an operand that would be
	/// commutable with the operand#1.
	virtual bool findCommutedOpIndices(MachineInstr &MI, unsigned &SrcOpIdx1,
	unsigned &SrcOpIdx2) const;

	/// A pair composed of a register and a sub-register index.
	/// Used to give some type checking when modeling Reg:SubReg.
	struct RegSubRegPair {
	  unsigned Reg;    ///< Register half of the pair.
	  unsigned SubReg; ///< Sub-register index half of the pair.

	  /// Both components default to 0 when omitted.
	  RegSubRegPair(unsigned Reg = 0, unsigned SubReg = 0) {
	    this->Reg = Reg;
	    this->SubReg = SubReg;
	  }
	};
	/// A Reg:SubReg pair extended with one more sub-register index.
	/// Gives some type checking when modeling Reg:SubReg1, SubReg2.
	struct RegSubRegPairAndIdx : RegSubRegPair {
	  unsigned SubIdx; ///< The additional sub-register index.

	  /// All three components default to 0 when omitted.
	  RegSubRegPairAndIdx(unsigned Reg = 0, unsigned SubReg = 0,
	                      unsigned SubIdx = 0)
	      : RegSubRegPair(Reg, SubReg) {
	    this->SubIdx = SubIdx;
	  }
	};

	/// Build the equivalent inputs of a REG_SEQUENCE for the given \p MI
	/// and \p DefIdx.
	/// \p [out] InputRegs of the equivalent REG_SEQUENCE. Each element of
	/// the list is modeled as <Reg:SubReg, SubIdx>.
	/// E.g., REG_SEQUENCE vreg1:sub1, sub0, vreg2, sub1 would produce
	/// two elements:
	/// - vreg1:sub1, sub0
	/// - vreg2<:0>, sub1
	///
	/// \returns true if it is possible to build such an input sequence
	/// with the pair \p MI, \p DefIdx. False otherwise.
	///
	/// \pre MI.isRegSequence() or MI.isRegSequenceLike().
	///
	/// \note The generic implementation does not provide any support for
	/// MI.isRegSequenceLike(). In other words, one has to override
	/// getRegSequenceLikeInputs for target specific instructions.
	bool
	getRegSequenceInputs(const MachineInstr &MI, unsigned DefIdx,
	SmallVectorImpl<RegSubRegPairAndIdx> &InputRegs) const;

	/// Build the equivalent inputs of an EXTRACT_SUBREG for the given \p MI
	/// and \p DefIdx.
	/// \p [out] InputReg of the equivalent EXTRACT_SUBREG.
	/// E.g., EXTRACT_SUBREG vreg1:sub1, sub0, sub1 would produce:
	/// - vreg1:sub1, sub0
	///
	/// \returns true if it is possible to build such an input sequence
	/// with the pair \p MI, \p DefIdx. False otherwise.
	///
	/// \pre MI.isExtractSubreg() or MI.isExtractSubregLike().
	///
	/// \note The generic implementation does not provide any support for
	/// MI.isExtractSubregLike(). In other words, one has to override
	/// getExtractSubregLikeInputs for target specific instructions.
	bool
	getExtractSubregInputs(const MachineInstr &MI, unsigned DefIdx,
	RegSubRegPairAndIdx &InputReg) const;

	/// Build the equivalent inputs of an INSERT_SUBREG for the given \p MI
	/// and \p DefIdx.
	/// \p [out] BaseReg and \p [out] InsertedReg contain
	/// the equivalent inputs of INSERT_SUBREG.
	/// E.g., INSERT_SUBREG vreg0:sub0, vreg1:sub1, sub3 would produce:
	/// - BaseReg: vreg0:sub0
	/// - InsertedReg: vreg1:sub1, sub3
	///
	/// \returns true if it is possible to build such an input sequence
	/// with the pair \p MI, \p DefIdx. False otherwise.
	///
	/// \pre MI.isInsertSubreg() or MI.isInsertSubregLike().
	///
	/// \note The generic implementation does not provide any support for
	/// MI.isInsertSubregLike(). In other words, one has to override
	/// getInsertSubregLikeInputs for target specific instructions.
	bool
	getInsertSubregInputs(const MachineInstr &MI, unsigned DefIdx,
	RegSubRegPair &BaseReg,
	RegSubRegPairAndIdx &InsertedReg) const;


	/// Return true if two machine instructions would produce identical values.
	/// By default, this is only true when the two instructions
	/// are deemed identical except for defs. If this function is called when the
	/// IR is still in SSA form, the caller can pass the MachineRegisterInfo for
	/// aggressive checks.
	virtual bool produceSameValue(const MachineInstr &MI0,
	const MachineInstr &MI1,
	const MachineRegisterInfo *MRI = nullptr) const;

	/// \returns true if a branch from an instruction with opcode \p BranchOpc
	/// is capable of jumping to a position \p BrOffset bytes away.
	virtual bool isBranchOffsetInRange(unsigned BranchOpc, int64_t BrOffset) const {
	  // No generic answer exists; reaching the base version is flagged as
	  // unreachable.
	  llvm_unreachable("target did not implement");
	}

	/// \returns The block that branch instruction \p MI jumps to.
	virtual MachineBasicBlock *
	getBranchDestBlock(const MachineInstr &MI) const {
	  // No generic answer exists; reaching the base version is flagged as
	  // unreachable.
	  llvm_unreachable("target did not implement");
	}

	/// Insert an unconditional indirect branch at the end of \p MBB to \p
	/// NewDestBB. \p BrOffset indicates the offset of \p NewDestBB relative to
	/// the offset of the position to insert the new branch.
	///
	/// \returns The number of bytes added to the block.
	virtual unsigned insertIndirectBranch(MachineBasicBlock &MBB,
	                                      MachineBasicBlock &NewDestBB,
	                                      const DebugLoc &DL, int64_t BrOffset = 0,
	                                      RegScavenger *RS = nullptr) const {
	  // No generic lowering exists; reaching the base version is flagged as
	  // unreachable.
	  llvm_unreachable("target did not implement");
	}

	/// Analyze the branching code at the end of MBB, returning
	/// true if it cannot be understood (e.g. it's a switch dispatch or isn't
	/// implemented for a target). Upon success, this returns false and returns
	/// with the following information in various cases:
	///
	/// 1. If this block ends with no branches (it just falls through to its succ)
	/// just return false, leaving TBB/FBB null.
	/// 2. If this block ends with only an unconditional branch, it sets TBB to be
	/// the destination block.
	/// 3. If this block ends with a conditional branch and it falls through to a
	/// successor block, it sets TBB to be the branch destination block and a
	/// list of operands that evaluate the condition. These operands can be
	/// passed to other TargetInstrInfo methods to create new branches.
	/// 4. If this block ends with a conditional branch followed by an
	/// unconditional branch, it returns the 'true' destination in TBB, the
	/// 'false' destination in FBB, and a list of operands that evaluate the
	/// condition. These operands can be passed to other TargetInstrInfo
	/// methods to create new branches.
	///
	/// Note that removeBranch and insertBranch must be implemented to support
	/// cases where this method returns success.
	///
	/// If AllowModify is true, then this routine is allowed to modify the basic
	/// block (e.g. delete instructions after the unconditional branch).
	///
	/// The CFG information in MBB.Predecessors and MBB.Successors must be valid
	/// before calling this function.
	virtual bool analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
	                           MachineBasicBlock *&FBB,
	                           SmallVectorImpl<MachineOperand> &Cond,
	                           bool AllowModify = false) const {
	  // Base implementation: report "cannot be understood" (true) and leave
	  // TBB, FBB and Cond untouched.
	  return true;
	}

	/// Represents a predicate at the MachineFunction level. The control flow a
	/// MachineBranchPredicate represents is:
	///
	/// Reg <def>= LHS `Predicate` RHS == ConditionDef
	/// if Reg then goto TrueDest else goto FalseDest
	///
	struct MachineBranchPredicate {
	  /// Comparison kinds this structure can carry.
	  enum ComparePredicate {
	    PRED_EQ,     // True if two values are equal
	    PRED_NE,     // True if two values are not equal
	    PRED_INVALID // Sentinel value
	  };

	  // Every field carries its default here; a freshly constructed object
	  // holds the sentinel predicate, two immediate-0 operands, null
	  // destinations, no defining instruction, and SingleUseCondition == false.
	  ComparePredicate Predicate = PRED_INVALID;
	  MachineOperand LHS = MachineOperand::CreateImm(0);
	  MachineOperand RHS = MachineOperand::CreateImm(0);
	  MachineBasicBlock *TrueDest = nullptr;
	  MachineBasicBlock *FalseDest = nullptr;
	  MachineInstr *ConditionDef = nullptr;

	  /// True when ConditionDef is dead apart from the branch(es) that end
	  /// the basic block.
	  bool SingleUseCondition = false;

	  /// Builds the default state described by the member initializers above.
	  explicit MachineBranchPredicate() {}
	};

	/// Analyze the branching code at the end of MBB and parse it into the
	/// MachineBranchPredicate structure if possible. Returns false on success
	/// and true on failure.
	///
	/// If AllowModify is true, then this routine is allowed to modify the basic
	/// block (e.g. delete instructions after the unconditional branch).
	///
	virtual bool analyzeBranchPredicate(MachineBasicBlock &MBB,
	                                    MachineBranchPredicate &MBP,
	                                    bool AllowModify = false) const {
	  // Base implementation: report failure (true) and leave MBP untouched.
	  return true;
	}

	/// Remove the branching code at the end of the specific MBB.
	/// This is only invoked in cases where analyzeBranch returns success. It
	/// returns the number of instructions that were removed.
	/// If \p BytesRemoved is non-null, report the change in code size from the
	/// removed instructions.
	virtual unsigned removeBranch(MachineBasicBlock &MBB,
	                              int *BytesRemoved = nullptr) const {
	  // No generic implementation; reaching the base version is flagged as
	  // unreachable.
	  llvm_unreachable("Target didn't implement TargetInstrInfo::removeBranch!");
	}

	/// Insert branch code into the end of the specified MachineBasicBlock. The
	/// operands to this method are the same as those returned by analyzeBranch.
	/// This is only invoked in cases where analyzeBranch returns success. It
	/// returns the number of instructions inserted. If \p BytesAdded is non-null,
	/// report the change in code size from the added instructions.
	///
	/// It is also invoked by tail merging to add unconditional branches in
	/// cases where analyzeBranch doesn't apply because there was no original
	/// branch to analyze. At least this much must be implemented, else tail
	/// merging needs to be disabled.
	///
	/// The CFG information in MBB.Predecessors and MBB.Successors must be valid
	/// before calling this function.
	virtual unsigned insertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
	                              MachineBasicBlock *FBB,
	                              ArrayRef<MachineOperand> Cond, const DebugLoc &DL,
	                              int *BytesAdded = nullptr) const {
	  // No generic implementation; reaching the base version is flagged as
	  // unreachable.
	  llvm_unreachable("Target didn't implement TargetInstrInfo::insertBranch!");
	}

	/// Convenience wrapper: forwards to insertBranch with \p DestBB as the
	/// taken destination, a null false destination, and an empty condition.
	unsigned insertUnconditionalBranch(MachineBasicBlock &MBB,
	                                   MachineBasicBlock *DestBB,
	                                   const DebugLoc &DL,
	                                   int *BytesAdded = nullptr) const {
	  ArrayRef<MachineOperand> EmptyCond = ArrayRef<MachineOperand>();
	  return insertBranch(MBB, DestBB, nullptr, EmptyCond, DL, BytesAdded);
	}

	/// Analyze the loop code, return true if it cannot be understood. Upon
	/// success, this function returns false and returns information about the
	/// induction variable and compare instruction used at the end.
	virtual bool analyzeLoop(MachineLoop &L, MachineInstr *&IndVarInst,
	                         MachineInstr *&CmpInst) const {
	  // Base implementation: report failure (true); IndVarInst and CmpInst are
	  // left unwritten.
	  return true;
	}

	/// Generate code to reduce the loop iteration by one and check if the loop is
	/// finished. Return the value/register of the new loop count. We need
	/// this function when peeling off one or more iterations of a loop. This
	/// function assumes the nth iteration is peeled first.
	virtual unsigned
	reduceLoopCount(MachineBasicBlock &MBB, MachineInstr *IndVar,
	                MachineInstr &Cmp, SmallVectorImpl<MachineOperand> &Cond,
	                SmallVectorImpl<MachineInstr *> &PrevInsts, unsigned Iter,
	                unsigned MaxIter) const {
	  // No generic implementation; reaching the base version is flagged as
	  // unreachable.
	  llvm_unreachable("Target didn't implement ReduceLoopCount");
	}

	/// Delete the instruction \p Tail and everything after it, replacing it with
	/// an unconditional branch to NewDest. This is used by the tail merging pass.
	virtual void ReplaceTailWithBranchTo(MachineBasicBlock::iterator Tail,
	MachineBasicBlock *NewDest) const;

	/// Return true if it's legal to split the given basic
	/// block at the specified instruction (i.e. instruction would be the start
	/// of a new basic block).
	virtual bool isLegalToSplitMBBAt(MachineBasicBlock &MBB,
	                                 MachineBasicBlock::iterator MBBI) const {
	  // Base implementation: splitting is permitted at every position.
	  return true;
	}

	/// Return true if it's profitable to predicate
	/// instructions with accumulated instruction latency of "NumCycles"
	/// of the specified basic block, where the probability of the instructions
	/// being executed is given by Probability, and Confidence is a measure
	/// of our confidence that it will be properly predicted.
	virtual bool isProfitableToIfCvt(MachineBasicBlock &MBB, unsigned NumCycles,
	                                 unsigned ExtraPredCycles,
	                                 BranchProbability Probability) const {
	  // Base implementation: predication is never reported as profitable.
	  return false;
	}

	/// Second variant of isProfitableToIfCvt. This one
	/// checks for the case where two basic blocks from true and false path
	/// of a if-then-else (diamond) are predicated on mutually exclusive
	/// predicates, where the probability of the true path being taken is given
	/// by Probability, and Confidence is a measure of our confidence that it
	/// will be properly predicted.
	virtual bool isProfitableToIfCvt(MachineBasicBlock &TMBB, unsigned NumTCycles,
	                                 unsigned ExtraTCycles,
	                                 MachineBasicBlock &FMBB, unsigned NumFCycles,
	                                 unsigned ExtraFCycles,
	                                 BranchProbability Probability) const {
	  // Base implementation: diamond predication is never reported as
	  // profitable.
	  return false;
	}

	/// Return true if it's profitable for if-converter to duplicate instructions
	/// of specified accumulated instruction latencies in the specified MBB to
	/// enable if-conversion.
	/// The probability of the instructions being executed is given by
	/// Probability, and Confidence is a measure of our confidence that it
	/// will be properly predicted.
	virtual bool isProfitableToDupForIfCvt(MachineBasicBlock &MBB,
	                                       unsigned NumCycles,
	                                       BranchProbability Probability) const {
	  // Base implementation: duplication is never reported as profitable.
	  return false;
	}

	/// Return true if it's profitable to unpredicate
	/// one side of a 'diamond', i.e. two sides of if-else predicated on mutually
	/// exclusive predicates.
	/// e.g.
	/// subeq r0, r1, #1
	/// addne r0, r1, #1
	/// =>
	/// sub r0, r1, #1
	/// addne r0, r1, #1
	///
	/// This may be profitable if conditional instructions are always executed.
	virtual bool isProfitableToUnpredicate(MachineBasicBlock &TMBB,
	                                       MachineBasicBlock &FMBB) const {
	  // Base implementation: unpredication is never reported as profitable.
	  return false;
	}

	/// Return true if it is possible to insert a select
	/// instruction that chooses between TrueReg and FalseReg based on the
	/// condition code in Cond.
	///
	/// When successful, also return the latency in cycles from TrueReg,
	/// FalseReg, and Cond to the destination register. In most cases, a select
	/// instruction will be 1 cycle, so CondCycles = TrueCycles = FalseCycles = 1
	///
	/// Some x86 implementations have 2-cycle cmov instructions.
	///
	/// @param MBB Block where select instruction would be inserted.
	/// @param Cond Condition returned by analyzeBranch.
	/// @param TrueReg Virtual register to select when Cond is true.
	/// @param FalseReg Virtual register to select when Cond is false.
	/// @param CondCycles Latency from Cond+Branch to select output.
	/// @param TrueCycles Latency from TrueReg to select output.
	/// @param FalseCycles Latency from FalseReg to select output.
	virtual bool canInsertSelect(const MachineBasicBlock &MBB,
	                             ArrayRef<MachineOperand> Cond, unsigned TrueReg,
	                             unsigned FalseReg, int &CondCycles,
	                             int &TrueCycles, int &FalseCycles) const {
	  // Base implementation: a select can never be inserted; the three cycle
	  // outputs are left unwritten.
	  return false;
	}

	/// Insert a select instruction into MBB before I that will copy TrueReg to
	/// DstReg when Cond is true, and FalseReg to DstReg when Cond is false.
	///
	/// This function can only be called after canInsertSelect() returned true.
	/// The condition in Cond comes from analyzeBranch, and it can be assumed
	/// that the same flags or registers required by Cond are available at the
	/// insertion point.
	///
	/// @param MBB Block where select instruction should be inserted.
	/// @param I Insertion point.
	/// @param DL Source location for debugging.
	/// @param DstReg Virtual register to be defined by select instruction.
	/// @param Cond Condition as computed by analyzeBranch.
	/// @param TrueReg Virtual register to copy when Cond is true.
	/// @param FalseReg Virtual register to copy when Cond is false.
	virtual void insertSelect(MachineBasicBlock &MBB,
	                          MachineBasicBlock::iterator I, const DebugLoc &DL,
	                          unsigned DstReg, ArrayRef<MachineOperand> Cond,
	                          unsigned TrueReg, unsigned FalseReg) const {
	  // No generic implementation; reaching the base version is flagged as
	  // unreachable.
	  llvm_unreachable("Target didn't implement TargetInstrInfo::insertSelect!");
	}

	/// Analyze the given select instruction, returning true if
	/// it cannot be understood. It is assumed that MI->isSelect() is true.
	///
	/// When successful, return the controlling condition and the operands that
	/// determine the true and false result values.
	///
	/// Result = SELECT Cond, TrueOp, FalseOp
	///
	/// Some targets can optimize select instructions, for example by predicating
	/// the instruction defining one of the operands. Such targets should set
	/// Optimizable.
	///
	/// @param MI Select instruction to analyze.
	/// @param Cond Condition controlling the select.
	/// @param TrueOp Operand number of the value selected when Cond is true.
	/// @param FalseOp Operand number of the value selected when Cond is false.
	/// @param Optimizable Returned as true if MI is optimizable.
	/// @returns False on success.
	virtual bool analyzeSelect(const MachineInstr &MI,
	                           SmallVectorImpl<MachineOperand> &Cond,
	                           unsigned &TrueOp, unsigned &FalseOp,
	                           bool &Optimizable) const {
	  // Enforce the documented precondition before answering.
	  assert(MI.getDesc().isSelect() && "MI must be a select instruction");

	  // Base implementation: report "cannot be understood" (true); none of the
	  // output parameters are written.
	  return true;
	}

	/// Given a select instruction that was understood by
	/// analyzeSelect and returned Optimizable = true, attempt to optimize MI by
	/// merging it with one of its operands. Returns NULL on failure.
	///
	/// When successful, returns the new select instruction. The client is
	/// responsible for deleting MI.
	///
	/// If both sides of the select can be optimized, PreferFalse is used to pick
	/// a side.
	///
	/// @param MI Optimizable select instruction.
	/// @param NewMIs Set that record all MIs in the basic block up to \p
	/// MI. Has to be updated with any newly created MI or deleted ones.
	/// @param PreferFalse Try to optimize FalseOp instead of TrueOp.
	/// @returns Optimized instruction or NULL.
	virtual MachineInstr *
	optimizeSelect(MachineInstr &MI, SmallPtrSetImpl<MachineInstr *> &NewMIs,
	               bool PreferFalse = false) const {
	  // A target that ever sets Optimizable in analyzeSelect has to provide
	  // this function too; the base version is flagged as unreachable.
	  llvm_unreachable("Target must implement TargetInstrInfo::optimizeSelect!");
	}

	/// Emit instructions to copy a pair of physical registers.
	///
	/// This function should support copies within any legal register class as
	/// well as any cross-class copies created during instruction selection.
	///
	/// The source and destination registers may overlap, which may require a
	/// careful implementation when multiple copy instructions are required for
	/// large registers. See for example the ARM target.
	virtual void copyPhysReg(MachineBasicBlock &MBB,
	                         MachineBasicBlock::iterator MI, const DebugLoc &DL,
	                         unsigned DestReg, unsigned SrcReg,
	                         bool KillSrc) const {
	  // No generic implementation; reaching the base version is flagged as
	  // unreachable.
	  llvm_unreachable("Target didn't implement TargetInstrInfo::copyPhysReg!");
	}

	/// Store the specified register of the given register class to the specified
	/// stack frame index. The store instruction is to be added to the given
	/// machine basic block before the specified machine instruction. If isKill
	/// is true, the register operand is the last use and must be marked kill.
	virtual void storeRegToStackSlot(MachineBasicBlock &MBB,
	                                 MachineBasicBlock::iterator MI,
	                                 unsigned SrcReg, bool isKill,
	                                 int FrameIndex,
	                                 const TargetRegisterClass *RC,
	                                 const TargetRegisterInfo *TRI) const {
	  // No generic implementation; reaching the base version is flagged as
	  // unreachable.
	  llvm_unreachable("Target didn't implement "
	                   "TargetInstrInfo::storeRegToStackSlot!");
	}

	/// Load the specified register of the given register class from the specified
	/// stack frame index. The load instruction is to be added to the given
	/// machine basic block before the specified machine instruction.
	virtual void loadRegFromStackSlot(MachineBasicBlock &MBB,
	                                  MachineBasicBlock::iterator MI,
	                                  unsigned DestReg, int FrameIndex,
	                                  const TargetRegisterClass *RC,
	                                  const TargetRegisterInfo *TRI) const {
	  // No generic implementation; reaching the base version is flagged as
	  // unreachable.
	  llvm_unreachable("Target didn't implement "
	                   "TargetInstrInfo::loadRegFromStackSlot!");
	}

	/// This function is called for all pseudo instructions
	/// that remain after register allocation. Many pseudo instructions are
	/// created to help register allocation. This is the place to convert them
	/// into real instructions. The target can edit MI in place, or it can insert
	/// new instructions and erase MI. The function should return true if
	/// anything was changed.
	virtual bool expandPostRAPseudo(MachineInstr &MI) const {
	  // Base implementation: expands nothing, so reports "no change".
	  return false;
	}

	/// Check whether the target can fold a load that feeds a subreg operand
	/// (or a subreg operand that feeds a store).
	/// For example, X86 may want to return true if it can fold
	/// movl (%esp), %eax
	/// subb, %al, ...
	/// Into:
	/// subb (%esp), ...
	///
	/// Ideally, we'd like the target implementation of foldMemoryOperand() to
	/// reject subregs - but since this behavior used to be enforced in the
	/// target-independent code, moving this responsibility to the targets
	/// has the potential of causing nasty silent breakage in out-of-tree targets.
	virtual bool isSubregFoldable() const {
	  // Base implementation: sub-register folding is not supported.
	  return false;
	}

	/// Attempt to fold a load or store of the specified stack
	/// slot into the specified machine instruction for the specified operand(s).
	/// If this is possible, a new instruction is returned with the specified
	/// operand folded, otherwise NULL is returned.
	/// The new instruction is inserted before MI, and the client is responsible
	/// for removing the old instruction.
	MachineInstr *foldMemoryOperand(MachineInstr &MI, ArrayRef<unsigned> Ops,
	int FrameIndex,
	LiveIntervals *LIS = nullptr) const;

	/// Same as the previous version except it allows folding of any load and
	/// store from / to any address, not just from a specific stack slot.
	MachineInstr *foldMemoryOperand(MachineInstr &MI, ArrayRef<unsigned> Ops,
	MachineInstr &LoadMI,
	LiveIntervals *LIS = nullptr) const;

	/// Return true when there is potentially a faster code sequence
	/// for an instruction chain ending in \p Root. All potential patterns are
	/// returned in the \p Pattern vector. Pattern should be sorted in priority
	/// order since the pattern evaluator stops checking as soon as it finds a
	/// faster sequence.
	/// \param Root - Instruction that could be combined with one of its operands
	/// \param Patterns - Vector of possible combination patterns
	virtual bool getMachineCombinerPatterns(
	MachineInstr &Root,
	SmallVectorImpl<MachineCombinerPattern> &Patterns) const;

	/// Return true when a code sequence can improve throughput. It
	/// should be called only for instructions in loops.
	/// \param Pattern - combiner pattern
	virtual bool isThroughputPattern(MachineCombinerPattern Pattern) const;

	/// Return true if the input \p Inst is part of a chain of dependent ops
	/// that are suitable for reassociation, otherwise return false.
	/// If the instruction's operands must be commuted to have a previous
	/// instruction of the same type define the first source operand, \p Commuted
	/// will be set to true.
	bool isReassociationCandidate(const MachineInstr &Inst, bool &Commuted) const;

	/// Return true when \p Inst is both associative and commutative.
	virtual bool isAssociativeAndCommutative(const MachineInstr &Inst) const {
	return false;
	}

	/// Return true when \p Inst has reassociable operands in the same \p MBB.
	virtual bool hasReassociableOperands(const MachineInstr &Inst,
	const MachineBasicBlock *MBB) const;

	/// Return true when \p Inst has a reassociable sibling.
	bool hasReassociableSibling(const MachineInstr &Inst, bool &Commuted) const;

	/// When getMachineCombinerPatterns() finds patterns, this function generates
	/// the instructions that could replace the original code sequence. The client
	/// has to decide whether the actual replacement is beneficial or not.
	/// \param Root - Instruction that could be combined with one of its operands
	/// \param Pattern - Combination pattern for Root
	/// \param InsInstrs - Vector of new instructions that implement P
	/// \param DelInstrs - Old instructions, including Root, that could be
	/// replaced by InsInstr
	/// \param InstrIdxForVirtReg - map of virtual register to instruction in
	/// InsInstr that defines it
	virtual void genAlternativeCodeSequence(
	MachineInstr &Root, MachineCombinerPattern Pattern,
	SmallVectorImpl<MachineInstr *> &InsInstrs,
	SmallVectorImpl<MachineInstr *> &DelInstrs,
	DenseMap<unsigned, unsigned> &InstrIdxForVirtReg) const;

	/// Attempt to reassociate \p Root and \p Prev according to \p Pattern to
	/// reduce critical path length.
	void reassociateOps(MachineInstr &Root, MachineInstr &Prev,
	MachineCombinerPattern Pattern,
	SmallVectorImpl<MachineInstr *> &InsInstrs,
	SmallVectorImpl<MachineInstr *> &DelInstrs,
	DenseMap<unsigned, unsigned> &InstrIdxForVirtReg) const;

	/// This is an architecture-specific helper function of reassociateOps.
	/// Set special operand attributes for new instructions after reassociation.
	/// The default implementation has an empty body and therefore leaves all
	/// four instructions untouched.
	virtual void setSpecialOperandAttr(MachineInstr &OldMI1, MachineInstr &OldMI2,
	MachineInstr &NewMI1,
	MachineInstr &NewMI2) const {
	}

	/// Return true when a target supports MachineCombiner.
	virtual bool useMachineCombiner() const { return false; }

	protected:
	/// Target-dependent implementation for foldMemoryOperand.
	/// Target-independent code in foldMemoryOperand will
	/// take care of adding a MachineMemOperand to the newly created instruction.
	/// The instruction and any auxiliary instructions necessary will be inserted
	/// at InsertPt.
	virtual MachineInstr *
	foldMemoryOperandImpl(MachineFunction &MF, MachineInstr &MI,
	ArrayRef<unsigned> Ops,
	MachineBasicBlock::iterator InsertPt, int FrameIndex,
	LiveIntervals *LIS = nullptr) const {
	return nullptr;
	}

	/// Target-dependent implementation for foldMemoryOperand.
	/// Target-independent code in foldMemoryOperand will
	/// take care of adding a MachineMemOperand to the newly created instruction.
	/// The instruction and any auxiliary instructions necessary will be inserted
	/// at InsertPt.
	virtual MachineInstr *foldMemoryOperandImpl(
	MachineFunction &MF, MachineInstr &MI, ArrayRef<unsigned> Ops,
	MachineBasicBlock::iterator InsertPt, MachineInstr &LoadMI,
	LiveIntervals *LIS = nullptr) const {
	return nullptr;
	}

	/// \brief Target-dependent implementation of getRegSequenceInputs.
	///
	/// \returns true if it is possible to build the equivalent
	/// REG_SEQUENCE inputs with the pair \p MI, \p DefIdx. False otherwise.
	///
	/// \pre MI.isRegSequenceLike().
	///
	/// \see TargetInstrInfo::getRegSequenceInputs.
	virtual bool getRegSequenceLikeInputs(
	const MachineInstr &MI, unsigned DefIdx,
	SmallVectorImpl<RegSubRegPairAndIdx> &InputRegs) const {
	return false;
	}

	/// \brief Target-dependent implementation of getExtractSubregInputs.
	///
	/// \returns true if it is possible to build the equivalent
	/// EXTRACT_SUBREG inputs with the pair \p MI, \p DefIdx. False otherwise.
	///
	/// \pre MI.isExtractSubregLike().
	///
	/// \see TargetInstrInfo::getExtractSubregInputs.
	virtual bool getExtractSubregLikeInputs(
	const MachineInstr &MI, unsigned DefIdx,
	RegSubRegPairAndIdx &InputReg) const {
	return false;
	}

	/// \brief Target-dependent implementation of getInsertSubregInputs.
	///
	/// \returns true if it is possible to build the equivalent
	/// INSERT_SUBREG inputs with the pair \p MI, \p DefIdx. False otherwise.
	///
	/// \pre MI.isInsertSubregLike().
	///
	/// \see TargetInstrInfo::getInsertSubregInputs.
	virtual bool
	getInsertSubregLikeInputs(const MachineInstr &MI, unsigned DefIdx,
	RegSubRegPair &BaseReg,
	RegSubRegPairAndIdx &InsertedReg) const {
	return false;
	}

	public:
	/// unfoldMemoryOperand - Separate a single instruction which folded a load or
	/// a store or a load and a store into two or more instruction. If this is
	/// possible, returns true as well as the new instructions by reference.
	virtual bool
	unfoldMemoryOperand(MachineFunction &MF, MachineInstr &MI, unsigned Reg,
	bool UnfoldLoad, bool UnfoldStore,
	SmallVectorImpl<MachineInstr *> &NewMIs) const {
	return false;
	}

	/// SelectionDAG overload of unfoldMemoryOperand: takes the node \p N of
	/// \p DAG and a vector \p NewNodes by reference.
	/// NOTE(review): presumably \p NewNodes receives the unfolded nodes when the
	/// function returns true -- confirm against target overrides.
	/// The default implementation returns false.
	virtual bool unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N,
	SmallVectorImpl<SDNode*> &NewNodes) const {
	return false;
	}

	/// Returns the opcode of the would be new
	/// instruction after load / store are unfolded from an instruction of the
	/// specified opcode. It returns zero if the specified unfolding is not
	/// possible. If LoadRegIndex is non-null, it is filled in with the operand
	/// index of the operand which will hold the register holding the loaded
	/// value.
	virtual unsigned getOpcodeAfterMemoryUnfold(unsigned Opc,
	bool UnfoldLoad, bool UnfoldStore,
	unsigned *LoadRegIndex = nullptr) const {
	return 0;
	}

	/// This is used by the pre-regalloc scheduler to determine if two loads are
	/// loading from the same base address. It should only return true if the base
	/// pointers are the same and the only differences between the two addresses
	/// are the offset. It also returns the offsets by reference.
	virtual bool areLoadsFromSameBasePtr(SDNode Load1, SDNode Load2,
	int64_t &Offset1, int64_t &Offset2) const {
	return false;
	}

	/// This is a used by the pre-regalloc scheduler to determine (in conjunction
	/// with areLoadsFromSameBasePtr) if two loads should be scheduled together.
	/// On some targets if two loads are loading from
	/// addresses in the same cache line, it's better if they are scheduled
	/// together. This function takes two integers that represent the load offsets
	/// from the common base address. It returns true if it decides it's desirable
	/// to schedule the two loads together. "NumLoads" is the number of loads that
	/// have already been scheduled after Load1.
	virtual bool shouldScheduleLoadsNear(SDNode Load1, SDNode Load2,
	int64_t Offset1, int64_t Offset2,
	unsigned NumLoads) const {
	return false;
	}

	/// Get the base register and byte offset of an instruction that reads/writes
	/// memory.
	virtual bool getMemOpBaseRegImmOfs(MachineInstr &MemOp, unsigned &BaseReg,
	int64_t &Offset,
	const TargetRegisterInfo *TRI) const {
	return false;
	}

	/// Return true if the instruction contains a base register and offset. If
	/// true, the function also sets the operand position in the instruction
	/// for the base register and offset.
	virtual bool getBaseAndOffsetPosition(const MachineInstr &MI,
	unsigned &BasePos,
	unsigned &OffsetPos) const {
	return false;
	}

	/// If the instruction is an increment of a constant value, report the amount.
	/// NOTE(review): the amount is presumably written to \p Value, with the
	/// boolean result signalling success -- confirm against target overrides.
	/// The default implementation returns false and does not modify \p Value.
	virtual bool getIncrementValue(const MachineInstr &MI, int &Value) const {
	return false;
	}

	/// Returns true if the two given memory operations should be scheduled
	/// adjacent. Note that you have to add:
	/// DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
	/// or
	/// DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
	/// to TargetPassConfig::createMachineScheduler() to have an effect.
	virtual bool shouldClusterMemOps(MachineInstr &FirstLdSt,
	MachineInstr &SecondLdSt,
	unsigned NumLoads) const {
	llvm_unreachable("target did not implement shouldClusterMemOps()");
	}

	/// Can this target fuse the given instructions if they are scheduled
	/// adjacent. Note that you have to add:
	/// DAG.addMutation(createMacroFusionDAGMutation());
	/// to TargetPassConfig::createMachineScheduler() to have an effect.
	virtual bool shouldScheduleAdjacent(const MachineInstr &First,
	const MachineInstr &Second) const {
	llvm_unreachable("target did not implement shouldScheduleAdjacent()");
	}

	/// Reverses the branch condition of the specified condition list,
	/// returning false on success and true if it cannot be reversed.
	virtual
	bool reverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const {
	return true;
	}

	/// Insert a noop into the instruction stream at the specified point.
	virtual void insertNoop(MachineBasicBlock &MBB,
	MachineBasicBlock::iterator MI) const;


	/// Return the noop instruction to use for a noop.
	virtual void getNoopForMachoTarget(MCInst &NopInst) const;

	/// Return true for post-incremented instructions.
	virtual bool isPostIncrement(const MachineInstr &MI) const {
	return false;
	}

	/// Returns true if the instruction is already predicated.
	virtual bool isPredicated(const MachineInstr &MI) const {
	return false;
	}

	/// Returns true if the instruction is a
	/// terminator instruction that has not been predicated.
	virtual bool isUnpredicatedTerminator(const MachineInstr &MI) const;

	- /// Returns true if MI is an unconditional tail call.
	- virtual bool isUnconditionalTailCall(const MachineInstr &MI) const {
	- return false;
	- }
	-
	- /// Returns true if the tail call can be made conditional on BranchCond.
	- virtual bool
	- canMakeTailCallConditional(SmallVectorImpl<MachineOperand> &Cond,
	- const MachineInstr &TailCall) const {
	- return false;
	- }
	-
	- /// Replace the conditional branch in MBB with a conditional tail call.
	- virtual void replaceBranchWithTailCall(MachineBasicBlock &MBB,
	- SmallVectorImpl<MachineOperand> &Cond,
	- const MachineInstr &TailCall) const {
	- llvm_unreachable("Target didn't implement replaceBranchWithTailCall!");
	- }
	-
	/// Convert the instruction into a predicated instruction.
	/// It returns true if the operation was successful.
	virtual bool PredicateInstruction(MachineInstr &MI,
	ArrayRef<MachineOperand> Pred) const;

	/// Returns true if the first specified predicate
	/// subsumes the second, e.g. GE subsumes GT.
	virtual
	bool SubsumesPredicate(ArrayRef<MachineOperand> Pred1,
	ArrayRef<MachineOperand> Pred2) const {
	return false;
	}

	/// If the specified instruction defines any predicate
	/// or condition code register(s) used for predication, returns true as well
	/// as the definition predicate(s) by reference.
	virtual bool DefinesPredicate(MachineInstr &MI,
	std::vector<MachineOperand> &Pred) const {
	return false;
	}

	/// Return true if the specified instruction can be predicated.
	/// By default, this returns true for every instruction with a
	/// PredicateOperand.
	virtual bool isPredicable(MachineInstr &MI) const {
	return MI.getDesc().isPredicable();
	}

	/// Return true if it's safe to move a machine
	/// instruction that defines the specified register class.
	virtual bool isSafeToMoveRegClassDefs(const TargetRegisterClass *RC) const {
	return true;
	}

	/// Test if the given instruction should be considered a scheduling boundary.
	/// This primarily includes labels and terminators.
	virtual bool isSchedulingBoundary(const MachineInstr &MI,
	const MachineBasicBlock *MBB,
	const MachineFunction &MF) const;

	/// Measure the specified inline asm to determine an approximation of its
	/// length.
	virtual unsigned getInlineAsmLength(const char *Str,
	const MCAsmInfo &MAI) const;

	/// Allocate and return a hazard recognizer to use for this target when
	/// scheduling the machine instructions before register allocation.
	virtual ScheduleHazardRecognizer*
	CreateTargetHazardRecognizer(const TargetSubtargetInfo *STI,
	const ScheduleDAG *DAG) const;

	/// Allocate and return a hazard recognizer to use for this target when
	/// scheduling the machine instructions before register allocation.
	virtual ScheduleHazardRecognizer*
	CreateTargetMIHazardRecognizer(const InstrItineraryData*,
	const ScheduleDAG *DAG) const;

	/// Allocate and return a hazard recognizer to use for this target when
	/// scheduling the machine instructions after register allocation.
	virtual ScheduleHazardRecognizer*
	CreateTargetPostRAHazardRecognizer(const InstrItineraryData*,
	const ScheduleDAG *DAG) const;

	/// Allocate and return a hazard recognizer for use by non-scheduling
	/// passes. The default implementation returns nullptr.
	virtual ScheduleHazardRecognizer*
	CreateTargetPostRAHazardRecognizer(const MachineFunction &MF) const {
	return nullptr;
	}

	/// Provide a global flag for disabling the PreRA hazard recognizer that
	/// targets may choose to honor.
	bool usePreRAHazardRecognizer() const;

	/// For a comparison instruction, return the source registers
	/// in \p SrcReg and \p SrcReg2 if having two register operands, and the value
	/// it compares against in \p Value. Return true if the comparison instruction
	/// can be analyzed.
	/// NOTE(review): \p Mask is not described by the original comment; presumably
	/// it is a mask applied to the compared value -- confirm against target
	/// overrides.
	/// The default implementation returns false without touching any of the
	/// output parameters.
	virtual bool analyzeCompare(const MachineInstr &MI, unsigned &SrcReg,
	unsigned &SrcReg2, int &Mask, int &Value) const {
	return false;
	}

	/// See if the comparison instruction can be converted
	/// into something more efficient. E.g., on ARM most instructions can set the
	/// flags register, obviating the need for a separate CMP.
	virtual bool optimizeCompareInstr(MachineInstr &CmpInstr, unsigned SrcReg,
	unsigned SrcReg2, int Mask, int Value,
	const MachineRegisterInfo *MRI) const {
	return false;
	}
	/// Hook taking a single instruction \p MI; the default implementation
	/// returns false.
	/// NOTE(review): presumably the conditional-branch counterpart of
	/// optimizeCompareInstr, returning true when \p MI was optimized -- confirm.
	virtual bool optimizeCondBranch(MachineInstr &MI) const { return false; }

	/// Try to remove the load by folding it to a register operand at the use.
	/// We fold the load instructions if and only if the
	/// def and use are in the same BB. We only look at one load and see
	/// whether it can be folded into MI. FoldAsLoadDefReg is the virtual register
	/// defined by the load we are trying to fold. DefMI returns the machine
	/// instruction that defines FoldAsLoadDefReg, and the function returns
	/// the machine instruction generated due to folding.
	virtual MachineInstr *optimizeLoadInstr(MachineInstr &MI,
	const MachineRegisterInfo *MRI,
	unsigned &FoldAsLoadDefReg,
	MachineInstr *&DefMI) const {
	return nullptr;
	}

	/// 'Reg' is known to be defined by a move immediate instruction,
	/// try to fold the immediate into the use instruction.
	/// If MRI->hasOneNonDBGUse(Reg) is true, and this function returns true,
	/// then the caller may assume that DefMI has been erased from its parent
	/// block. The caller may assume that it will not be erased by this
	/// function otherwise.
	virtual bool FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
	unsigned Reg, MachineRegisterInfo *MRI) const {
	return false;
	}

	/// Return the number of micro-operations (uops) the given machine
	/// instruction will be decoded to on the target cpu. The itinerary's
	/// IssueWidth is the number of microops that can be dispatched each
	/// cycle. An instruction with zero microops takes no dispatch resources.
	virtual unsigned getNumMicroOps(const InstrItineraryData *ItinData,
	const MachineInstr &MI) const;

	/// Return true for pseudo instructions that don't consume any
	/// machine resources in their current form. These are common cases that the
	/// scheduler should consider free, rather than conservatively handling them
	/// as instructions with no itinerary.
	bool isZeroCost(unsigned Opcode) const {
	// Every opcode numbered at or below TargetOpcode::COPY is reported as free.
	// NOTE(review): this relies on the ordering of the TargetOpcode enum, which
	// is defined outside this file -- confirm when adding generic opcodes.
	return Opcode <= TargetOpcode::COPY;
	}

	virtual int getOperandLatency(const InstrItineraryData *ItinData,
	SDNode *DefNode, unsigned DefIdx,
	SDNode *UseNode, unsigned UseIdx) const;

	/// Compute and return the use operand latency of a given pair of def and use.
	/// In most cases, the static scheduling itinerary was enough to determine the
	/// operand latency. But it may not be possible for instructions with variable
	/// number of defs / uses.
	///
	/// This is a raw interface to the itinerary that may be directly overridden
	/// by a target. Use computeOperandLatency to get the best estimate of
	/// latency.
	virtual int getOperandLatency(const InstrItineraryData *ItinData,
	const MachineInstr &DefMI, unsigned DefIdx,
	const MachineInstr &UseMI,
	unsigned UseIdx) const;

	/// Compute the instruction latency of a given instruction.
	/// If the instruction has higher cost when predicated, it's returned via
	/// PredCost.
	virtual unsigned getInstrLatency(const InstrItineraryData *ItinData,
	const MachineInstr &MI,
	unsigned *PredCost = nullptr) const;

	virtual unsigned getPredicationCost(const MachineInstr &MI) const;

	virtual int getInstrLatency(const InstrItineraryData *ItinData,
	SDNode *Node) const;

	/// Return the default expected latency for a def based on its opcode.
	unsigned defaultDefLatency(const MCSchedModel &SchedModel,
	const MachineInstr &DefMI) const;

	int computeDefOperandLatency(const InstrItineraryData *ItinData,
	const MachineInstr &DefMI) const;

	/// Return true if this opcode has high latency to its result.
	virtual bool isHighLatencyDef(int opc) const { return false; }

	/// Compute operand latency between a def of 'Reg'
	/// and a use in the current loop. Return true if the target considered
	/// it 'high'. This is used by optimization passes such as machine LICM to
	/// determine whether it makes sense to hoist an instruction out even in a
	/// high register pressure situation.
	virtual bool hasHighOperandLatency(const TargetSchedModel &SchedModel,
	const MachineRegisterInfo *MRI,
	const MachineInstr &DefMI, unsigned DefIdx,
	const MachineInstr &UseMI,
	unsigned UseIdx) const {
	return false;
	}

	/// Compute operand latency of a def of 'Reg'. Return true
	/// if the target considered it 'low'.
	virtual bool hasLowDefLatency(const TargetSchedModel &SchedModel,
	const MachineInstr &DefMI,
	unsigned DefIdx) const;

	/// Perform target-specific instruction verification.
	virtual bool verifyInstruction(const MachineInstr &MI,
	StringRef &ErrInfo) const {
	return true;
	}

	/// Return the current execution domain and bit mask of
	/// possible domains for instruction.
	///
	/// Some micro-architectures have multiple execution domains, and multiple
	/// opcodes that perform the same operation in different domains. For
	/// example, the x86 architecture provides the por, orps, and orpd
	/// instructions that all do the same thing. There is a latency penalty if a
	/// register is written in one domain and read in another.
	///
	/// This function returns a pair (domain, mask) containing the execution
	/// domain of MI, and a bit mask of possible domains. The setExecutionDomain
	/// function can be used to change the opcode to one of the domains in the
	/// bit mask. Instructions whose execution domain can't be changed should
	/// return a 0 mask.
	///
	/// The execution domain numbers don't have any special meaning except domain
	/// 0 is used for instructions that are not associated with any interesting
	/// execution domain.
	///
	virtual std::pair<uint16_t, uint16_t>
	getExecutionDomain(const MachineInstr &MI) const {
	return std::make_pair(0, 0);
	}

	/// Change the opcode of MI to execute in Domain.
	///
	/// The bit (1 << Domain) must be set in the mask returned from
	/// getExecutionDomain(MI).
	virtual void setExecutionDomain(MachineInstr &MI, unsigned Domain) const {}

	/// Returns the preferred minimum clearance
	/// before an instruction with an unwanted partial register update.
	///
	/// Some instructions only write part of a register, and implicitly need to
	/// read the other parts of the register. This may cause unwanted stalls
	/// preventing otherwise unrelated instructions from executing in parallel in
	/// an out-of-order CPU.
	///
	/// For example, the x86 instruction cvtsi2ss writes its result to bits
	/// [31:0] of the destination xmm register. Bits [127:32] are unaffected, so
	/// the instruction needs to wait for the old value of the register to become
	/// available:
	///
	/// addps %xmm1, %xmm0
	/// movaps %xmm0, (%rax)
	/// cvtsi2ss %rbx, %xmm0
	///
	/// In the code above, the cvtsi2ss instruction needs to wait for the addps
	/// instruction before it can issue, even though the high bits of %xmm0
	/// probably aren't needed.
	///
	/// This hook returns the preferred clearance before MI, measured in
	/// instructions. Other defs of MI's operand OpNum are avoided in the last N
	/// instructions before MI. It should only return a positive value for
	/// unwanted dependencies. If the old bits of the defined register have
	/// useful values, or if MI is determined to otherwise read the dependency,
	/// the hook should return 0.
	///
	/// The unwanted dependency may be handled by:
	///
	/// 1. Allocating the same register for an MI def and use. That makes the
	/// unwanted dependency identical to a required dependency.
	///
	/// 2. Allocating a register for the def that has no defs in the previous N
	/// instructions.
	///
	/// 3. Calling breakPartialRegDependency() with the same arguments. This
	/// allows the target to insert a dependency breaking instruction.
	///
	virtual unsigned
	getPartialRegUpdateClearance(const MachineInstr &MI, unsigned OpNum,
	const TargetRegisterInfo *TRI) const {
	// The default implementation returns 0 for no partial register dependency.
	return 0;
	}

	/// \brief Return the minimum clearance before an instruction that reads an
	/// unused register.
	///
	/// For example, AVX instructions may copy part of a register operand into
	/// the unused high bits of the destination register.
	///
	/// vcvtsi2sdq %rax, %xmm0<undef>, %xmm14
	///
	/// In the code above, vcvtsi2sdq copies %xmm0[127:64] into %xmm14 creating a
	/// false dependence on any previous write to %xmm0.
	///
	/// This hook works similarly to getPartialRegUpdateClearance, except that it
	/// does not take an operand index. Instead sets \p OpNum to the index of the
	/// unused register.
	virtual unsigned getUndefRegClearance(const MachineInstr &MI, unsigned &OpNum,
	const TargetRegisterInfo *TRI) const {
	// The default implementation returns 0 for no undef register dependency.
	return 0;
	}

	/// Insert a dependency-breaking instruction
	/// before MI to eliminate an unwanted dependency on OpNum.
	///
	/// If it wasn't possible to avoid a def in the last N instructions before MI
	/// (see getPartialRegUpdateClearance), this hook will be called to break the
	/// unwanted dependency.
	///
	/// On x86, an xorps instruction can be used as a dependency breaker:
	///
	/// addps %xmm1, %xmm0
	/// movaps %xmm0, (%rax)
	/// xorps %xmm0, %xmm0
	/// cvtsi2ss %rbx, %xmm0
	///
	/// An <imp-kill> operand should be added to MI if an instruction was
	/// inserted. This ties the instructions together in the post-ra scheduler.
	///
	virtual void breakPartialRegDependency(MachineInstr &MI, unsigned OpNum,
	const TargetRegisterInfo *TRI) const {}

	/// Create machine specific model for scheduling.
	virtual DFAPacketizer *
	CreateTargetScheduleState(const TargetSubtargetInfo &) const {
	return nullptr;
	}

	/// Sometimes, it is possible for the target
	/// to tell, even without aliasing information, that two MIs access different
	/// memory addresses. This function returns true if two MIs access different
	/// memory addresses and false otherwise.
	///
	/// Both \p MIa and \p MIb must be instructions that may load or may store;
	/// this is checked with assertions. The default implementation always
	/// returns false.
	virtual bool
	areMemAccessesTriviallyDisjoint(MachineInstr &MIa, MachineInstr &MIb,
	                                AliasAnalysis *AA = nullptr) const {
	  assert((MIa.mayLoad() || MIa.mayStore()) &&
	         "MIa must load from or modify a memory location");
	  assert((MIb.mayLoad() || MIb.mayStore()) &&
	         "MIb must load from or modify a memory location");
	  return false;
	}

	/// \brief Return the value to use for the MachineCSE's LookAheadLimit,
	/// which is a heuristic used for CSE'ing phys reg defs.
	virtual unsigned getMachineCSELookAheadLimit () const {
	// The default lookahead is small to prevent unprofitable quadratic
	// behavior.
	return 5;
	}

	/// Return an array that contains the ids of the target indices (used for the
	/// TargetIndex machine operand) and their names.
	///
	/// MIR Serialization is able to serialize only the target indices that are
	/// defined by this method.
	virtual ArrayRef<std::pair<int, const char *>>
	getSerializableTargetIndices() const {
	return None;
	}

	/// Decompose the machine operand's target flags into two values - the direct
	/// target flag value and any of bit flags that are applied.
	/// The default implementation ignores its argument and returns (0, 0).
	virtual std::pair<unsigned, unsigned>
	decomposeMachineOperandsTargetFlags(unsigned /*TF*/) const {
	  return std::make_pair(0u, 0u);
	}

	/// Return an array that contains the direct target flag values and their
	/// names.
	///
	/// MIR Serialization is able to serialize only the target flags that are
	/// defined by this method.
	virtual ArrayRef<std::pair<unsigned, const char *>>
	getSerializableDirectMachineOperandTargetFlags() const {
	return None;
	}

	/// Return an array that contains the bitmask target flag values and their
	/// names.
	///
	/// MIR Serialization is able to serialize only the target flags that are
	/// defined by this method.
	virtual ArrayRef<std::pair<unsigned, const char *>>
	getSerializableBitmaskMachineOperandTargetFlags() const {
	return None;
	}

	/// Determines whether \p Inst is a tail call instruction.
	/// The default implementation returns false.
	virtual bool isTailCall(const MachineInstr &Inst) const {
	return false;
	}

	private:
	unsigned CallFrameSetupOpcode, CallFrameDestroyOpcode;
	unsigned CatchRetOpcode;
	unsigned ReturnOpcode;
	};

	/// \brief DenseMapInfo specialization for TargetInstrInfo::RegSubRegPair.
	///
	/// Every operation is delegated, field by field, to DenseMapInfo<unsigned>,
	/// except hashing, which goes through the std::pair<unsigned, unsigned>
	/// specialization.
	template <>
	struct DenseMapInfo<TargetInstrInfo::RegSubRegPair> {
	  using RegInfo = DenseMapInfo<unsigned>;

	  /// Both Reg and SubReg hold the empty key of DenseMapInfo<unsigned>.
	  static inline TargetInstrInfo::RegSubRegPair getEmptyKey() {
	    const unsigned EmptyReg = RegInfo::getEmptyKey();
	    const unsigned EmptySubReg = RegInfo::getEmptyKey();
	    return TargetInstrInfo::RegSubRegPair(EmptyReg, EmptySubReg);
	  }

	  /// Both Reg and SubReg hold the tombstone key of DenseMapInfo<unsigned>.
	  static inline TargetInstrInfo::RegSubRegPair getTombstoneKey() {
	    const unsigned TombReg = RegInfo::getTombstoneKey();
	    const unsigned TombSubReg = RegInfo::getTombstoneKey();
	    return TargetInstrInfo::RegSubRegPair(TombReg, TombSubReg);
	  }

	  /// \brief Hash the (Reg, SubReg) pair with the getHashValue implementation
	  /// of std::pair<unsigned, unsigned>.
	  static unsigned getHashValue(const TargetInstrInfo::RegSubRegPair &Val) {
	    using PairInfo = DenseMapInfo<std::pair<unsigned, unsigned>>;
	    const std::pair<unsigned, unsigned> AsPair(Val.Reg, Val.SubReg);
	    return PairInfo::getHashValue(AsPair);
	  }

	  /// Two pairs are equal when both Reg and SubReg compare equal.
	  static bool isEqual(const TargetInstrInfo::RegSubRegPair &LHS,
	                      const TargetInstrInfo::RegSubRegPair &RHS) {
	    if (!RegInfo::isEqual(LHS.Reg, RHS.Reg))
	      return false;
	    return RegInfo::isEqual(LHS.SubReg, RHS.SubReg);
	  }
	};

	} // end namespace llvm

	#endif // LLVM_TARGET_TARGETINSTRINFO_H
	Index: projects/clang400-import/contrib/llvm/lib/Bitcode/Reader/MetadataLoader.cpp
	===================================================================
	--- projects/clang400-import/contrib/llvm/lib/Bitcode/Reader/MetadataLoader.cpp (revision 313642)
	+++ projects/clang400-import/contrib/llvm/lib/Bitcode/Reader/MetadataLoader.cpp (revision 313643)
	@@ -1,1804 +1,1850 @@
	//===- MetadataLoader.cpp - Internal BitcodeReader implementation ---------===//
	//
	// The LLVM Compiler Infrastructure
	//
	// This file is distributed under the University of Illinois Open Source
	// License. See LICENSE.TXT for details.
	//
	//===----------------------------------------------------------------------===//

	#include "MetadataLoader.h"
	#include "ValueList.h"

	#include "llvm/ADT/APFloat.h"
	#include "llvm/ADT/APInt.h"
	#include "llvm/ADT/ArrayRef.h"
	#include "llvm/ADT/DenseMap.h"
	#include "llvm/ADT/DenseSet.h"
	#include "llvm/ADT/None.h"
	#include "llvm/ADT/STLExtras.h"
	#include "llvm/ADT/SmallString.h"
	#include "llvm/ADT/SmallVector.h"
	#include "llvm/ADT/Statistic.h"
	#include "llvm/ADT/StringRef.h"
	#include "llvm/ADT/Triple.h"
	#include "llvm/ADT/Twine.h"
	#include "llvm/Bitcode/BitcodeReader.h"
	#include "llvm/Bitcode/BitstreamReader.h"
	#include "llvm/Bitcode/LLVMBitCodes.h"
	#include "llvm/IR/Argument.h"
	#include "llvm/IR/Attributes.h"
	#include "llvm/IR/AutoUpgrade.h"
	#include "llvm/IR/BasicBlock.h"
	#include "llvm/IR/CallSite.h"
	#include "llvm/IR/CallingConv.h"
	#include "llvm/IR/Comdat.h"
	#include "llvm/IR/Constant.h"
	#include "llvm/IR/Constants.h"
	#include "llvm/IR/DebugInfo.h"
	#include "llvm/IR/DebugInfoMetadata.h"
	#include "llvm/IR/DebugLoc.h"
	#include "llvm/IR/DerivedTypes.h"
	#include "llvm/IR/DiagnosticInfo.h"
	#include "llvm/IR/DiagnosticPrinter.h"
	#include "llvm/IR/Function.h"
	#include "llvm/IR/GVMaterializer.h"
	#include "llvm/IR/GlobalAlias.h"
	#include "llvm/IR/GlobalIFunc.h"
	#include "llvm/IR/GlobalIndirectSymbol.h"
	#include "llvm/IR/GlobalObject.h"
	#include "llvm/IR/GlobalValue.h"
	#include "llvm/IR/GlobalVariable.h"
	#include "llvm/IR/InlineAsm.h"
	#include "llvm/IR/InstrTypes.h"
	#include "llvm/IR/Instruction.h"
	#include "llvm/IR/Instructions.h"
	#include "llvm/IR/Intrinsics.h"
	#include "llvm/IR/LLVMContext.h"
	#include "llvm/IR/Module.h"
	#include "llvm/IR/ModuleSummaryIndex.h"
	#include "llvm/IR/OperandTraits.h"
	#include "llvm/IR/Operator.h"
	#include "llvm/IR/TrackingMDRef.h"
	#include "llvm/IR/Type.h"
	#include "llvm/IR/ValueHandle.h"
	#include "llvm/Support/AtomicOrdering.h"
	#include "llvm/Support/Casting.h"
	#include "llvm/Support/CommandLine.h"
	#include "llvm/Support/Compiler.h"
	#include "llvm/Support/Debug.h"
	#include "llvm/Support/Error.h"
	#include "llvm/Support/ErrorHandling.h"
	#include "llvm/Support/ManagedStatic.h"
	#include "llvm/Support/MemoryBuffer.h"
	#include "llvm/Support/raw_ostream.h"
	#include <algorithm>
	#include <cassert>
	#include <cstddef>
	#include <cstdint>
	#include <deque>
	#include <limits>
	#include <map>
	#include <memory>
	#include <string>
	#include <system_error>
	#include <tuple>
	#include <utility>
	#include <vector>

	using namespace llvm;

	#define DEBUG_TYPE "bitcode-reader"

	STATISTIC(NumMDStringLoaded, "Number of MDStrings loaded");
	STATISTIC(NumMDNodeTemporary, "Number of MDNode::Temporary created");
	STATISTIC(NumMDRecordLoaded, "Number of Metadata records loaded");

	/// Flag whether we need to import full type definitions for ThinLTO.
	/// Currently needed for Darwin and LLDB.
	static cl::opt<bool> ImportFullTypeDefinitions(
	"import-full-type-definitions", cl::init(false), cl::Hidden,
	cl::desc("Import full type definitions for ThinLTO."));

	static cl::opt<bool> DisableLazyLoading(
	"disable-ondemand-mds-loading", cl::init(false), cl::Hidden,
	cl::desc("Force disable the lazy-loading on-demand of metadata when "
	"loading bitcode for importing."));

	namespace {

	/// Decode a value whose least-significant bit selects the form of the result:
	/// even inputs yield U >> 1, odd inputs yield the bitwise complement of
	/// U >> 1.
	static int64_t unrotateSign(uint64_t U) {
	  const uint64_t Shifted = U >> 1;
	  if (U & 1)
	    return ~Shifted;
	  return Shifted;
	}

	/// Index-addressable list of the metadata produced while reading bitcode.
	///
	/// Besides the slots themselves, the list records which indices currently
	/// hold forward references, which indices hold nodes that still need to be
	/// resolved, and the bookkeeping used to upgrade old MDString-based type
	/// references.
	class BitcodeReaderMetadataList {
	  /// Array of metadata references.
	  ///
	  /// Don't use std::vector here. Some versions of libc++ copy (instead of
	  /// move) on resize, and TrackingMDRef is very expensive to copy.
	  SmallVector<TrackingMDRef, 1> MetadataPtrs;

	  /// The set of indices in MetadataPtrs above of forward references that were
	  /// generated.
	  SmallDenseSet<unsigned, 1> ForwardReference;

	  /// The set of indices in MetadataPtrs above of Metadata that need to be
	  /// resolved.
	  SmallDenseSet<unsigned, 1> UnresolvedNodes;

	  /// Structures for resolving old type refs.
	  struct {
	    SmallDenseMap<MDString *, TempMDTuple, 1> Unknown;
	    SmallDenseMap<MDString *, DICompositeType *, 1> Final;
	    SmallDenseMap<MDString *, DICompositeType *, 1> FwdDecls;
	    SmallVector<std::pair<TrackingMDRef, TempMDTuple>, 1> Arrays;
	  } OldTypeRefs;

	  LLVMContext &Context;

	public:
	  BitcodeReaderMetadataList(LLVMContext &C) : Context(C) {}

	  // vector compatibility methods
	  unsigned size() const { return MetadataPtrs.size(); }
	  void resize(unsigned N) { MetadataPtrs.resize(N); }
	  void push_back(Metadata *MD) { MetadataPtrs.emplace_back(MD); }
	  void clear() { MetadataPtrs.clear(); }
	  Metadata *back() const { return MetadataPtrs.back(); }
	  void pop_back() { MetadataPtrs.pop_back(); }
	  bool empty() const { return MetadataPtrs.empty(); }

	  /// Return the metadata stored at index \p i; \p i must be in range
	  /// (asserted).
	  Metadata *operator[](unsigned i) const {
	    assert(i < MetadataPtrs.size());
	    return MetadataPtrs[i];
	  }

	  /// Return the metadata stored at index \p I, or nullptr when \p I is out of
	  /// range.
	  Metadata *lookup(unsigned I) const {
	    if (I < MetadataPtrs.size())
	      return MetadataPtrs[I];
	    return nullptr;
	  }

	  /// Truncate the list to \p N entries. \p N must not exceed the current size
	  /// and no forward references or unresolved nodes may be outstanding (all
	  /// asserted).
	  void shrinkTo(unsigned N) {
	    assert(N <= size() && "Invalid shrinkTo request!");
	    assert(ForwardReference.empty() && "Unexpected forward refs");
	    assert(UnresolvedNodes.empty() && "Unexpected unresolved node");
	    MetadataPtrs.resize(N);
	  }

	  /// Return the given metadata, creating a replaceable forward reference if
	  /// necessary.
	  Metadata *getMetadataFwdRef(unsigned Idx);

	  /// Return the given metadata only if it is fully resolved.
	  ///
	  /// Gives the same result as \a lookup(), unless \a MDNode::isResolved()
	  /// would give \c false.
	  Metadata *getMetadataIfResolved(unsigned Idx);

	  MDNode *getMDNodeFwdRefOrNull(unsigned Idx);
	  void assignValue(Metadata *MD, unsigned Idx);
	  void tryToResolveCycles();

	  /// Return true while at least one forward reference index is recorded.
	  bool hasFwdRefs() const { return !ForwardReference.empty(); }

	  /// Return the first index in the forward-reference set's iteration order.
	  /// Requires hasFwdRefs() (asserted).
	  int getNextFwdRef() {
	    assert(hasFwdRefs());
	    return *ForwardReference.begin();
	  }

	  /// Upgrade a type that had an MDString reference.
	  void addTypeRef(MDString &UUID, DICompositeType &CT);

	  /// Upgrade a type that had an MDString reference.
	  Metadata *upgradeTypeRef(Metadata *MaybeUUID);

	  /// Upgrade a type ref array that may have MDString references.
	  Metadata *upgradeTypeRefArray(Metadata *MaybeTuple);

	private:
	  Metadata *resolveTypeRefArray(Metadata *MaybeTuple);
	};

	/// Store \p MD in slot \p Idx, growing the list when needed. When the slot
	/// already holds a forward-reference placeholder, all uses of the placeholder
	/// are redirected to \p MD and \p Idx is removed from the pending forward
	/// references.
	void BitcodeReaderMetadataList::assignValue(Metadata *MD, unsigned Idx) {
	  // Remember nodes that are not resolved yet so cycles can be fixed up later.
	  auto *Node = dyn_cast<MDNode>(MD);
	  if (Node && !Node->isResolved())
	    UnresolvedNodes.insert(Idx);

	  // Assigning exactly one past the end is a plain append.
	  if (Idx == size()) {
	    push_back(MD);
	    return;
	  }

	  if (Idx >= size())
	    resize(Idx + 1);

	  TrackingMDRef &Slot = MetadataPtrs[Idx];
	  if (Slot) {
	    // If there was a forward reference to this value, replace it. The
	    // temporary is owned here and released once the uses are rewritten.
	    TempMDTuple Placeholder(cast<MDTuple>(Slot.get()));
	    Placeholder->replaceAllUsesWith(MD);
	    ForwardReference.erase(Idx);
	    return;
	  }

	  // Empty slot: simply fill it.
	  Slot.reset(MD);
	}

	/// Return the metadata stored at \p Idx. When the slot is empty, a temporary
	/// node is created, stored in the slot, recorded as a forward reference and
	/// returned instead.
	Metadata *BitcodeReaderMetadataList::getMetadataFwdRef(unsigned Idx) {
	  if (Idx >= size())
	    resize(Idx + 1);

	  TrackingMDRef &Slot = MetadataPtrs[Idx];
	  if (Metadata *Existing = Slot)
	    return Existing;

	  // Track forward refs to be resolved later.
	  ForwardReference.insert(Idx);

	  // Create and return a placeholder, which will later be RAUW'd.
	  ++NumMDNodeTemporary;
	  Metadata *Placeholder = MDNode::getTemporary(Context, None).release();
	  Slot.reset(Placeholder);
	  return Placeholder;
	}

	/// Same as lookup(), except that an MDNode which is not resolved yet is
	/// reported as nullptr.
	Metadata *BitcodeReaderMetadataList::getMetadataIfResolved(unsigned Idx) {
	  Metadata *MD = lookup(Idx);
	  auto *Node = dyn_cast_or_null<MDNode>(MD);
	  const bool Unresolved = Node && !Node->isResolved();
	  return Unresolved ? nullptr : MD;
	}

	/// Fetch (or forward-reference) the metadata at \p Idx and return it as an
	/// MDNode, or nullptr when the entry is not an MDNode.
	MDNode *BitcodeReaderMetadataList::getMDNodeFwdRefOrNull(unsigned Idx) {
	  Metadata *MD = getMetadataFwdRef(Idx);
	  return dyn_cast_or_null<MDNode>(MD);
	}

	/// Finish resolution once no forward reference is outstanding.
	///
	/// In order: leftover forward declarations are promoted into the final
	/// type-ref map, deferred type-ref arrays are upgraded, string-based type
	/// refs are replaced (by the final type when known, otherwise by the string
	/// itself), and cycles are resolved on every node recorded as unresolved.
	/// Returns immediately, doing nothing, while forward references remain.
	void BitcodeReaderMetadataList::tryToResolveCycles() {
	if (!ForwardReference.empty())
	// Still forward references... can't resolve cycles.
	return;

	// Give up on finding a full definition for any forward decls that remain.
	for (const auto &Ref : OldTypeRefs.FwdDecls)
	OldTypeRefs.Final.insert(Ref);
	OldTypeRefs.FwdDecls.clear();

	// Upgrade from old type ref arrays. In strange cases, this could add to
	// OldTypeRefs.Unknown.
	for (const auto &Array : OldTypeRefs.Arrays)
	Array.second->replaceAllUsesWith(resolveTypeRefArray(Array.first.get()));
	OldTypeRefs.Arrays.clear();

	// Replace old string-based type refs with the resolved node, if possible.
	// If we haven't seen the node, leave it to the verifier to complain about
	// the invalid string reference.
	for (const auto &Ref : OldTypeRefs.Unknown) {
	if (DICompositeType *CT = OldTypeRefs.Final.lookup(Ref.first))
	Ref.second->replaceAllUsesWith(CT);
	else
	Ref.second->replaceAllUsesWith(Ref.first);
	}
	OldTypeRefs.Unknown.clear();

	if (UnresolvedNodes.empty())
	// Nothing to do.
	return;

	// Resolve any cycles.
	for (unsigned I : UnresolvedNodes) {
	auto &MD = MetadataPtrs[I];
	auto *N = dyn_cast_or_null<MDNode>(MD);
	// Slots that are empty or hold non-node metadata have nothing to resolve.
	if (!N)
	continue;

	assert(!N->isTemporary() && "Unexpected forward reference");
	N->resolveCycles();
	}

	// Make sure we return early again until there's another unresolved ref.
	UnresolvedNodes.clear();
	}

	/// Record the composite type \p CT under its identifier \p UUID. Forward
	/// declarations go to the FwdDecls table, everything else to the Final table.
	void BitcodeReaderMetadataList::addTypeRef(MDString &UUID,
	                                           DICompositeType &CT) {
	  assert(CT.getRawIdentifier() == &UUID && "Mismatched UUID");
	  auto &Table = CT.isForwardDecl() ? OldTypeRefs.FwdDecls : OldTypeRefs.Final;
	  Table.insert(std::make_pair(&UUID, &CT));
	}

	/// Upgrade a possibly string-based type reference.
	///
	/// \param MaybeUUID metadata that may be an MDString naming a type.
	/// \returns \p MaybeUUID unchanged when it is not an MDString; the final
	/// composite type registered for the string when one is known; otherwise a
	/// temporary node (created on first use and shared per string) that stands
	/// in until the string can be resolved.
	Metadata *BitcodeReaderMetadataList::upgradeTypeRef(Metadata *MaybeUUID) {
	  auto *UUID = dyn_cast_or_null<MDString>(MaybeUUID);
	  if (LLVM_LIKELY(!UUID))
	    return MaybeUUID;

	  if (auto *CT = OldTypeRefs.Final.lookup(UUID))
	    return CT;

	  // No final type yet: hand out (and remember) a placeholder for this UUID.
	  auto &Ref = OldTypeRefs.Unknown[UUID];
	  if (!Ref)
	    Ref = MDNode::getTemporary(Context, None);
	  return Ref.get();
	}

	/// Upgrade an array of possibly string-based type references.
	///
	/// \param MaybeTuple metadata that may be a tuple of type refs.
	/// \returns \p MaybeTuple unchanged when it is not a tuple or is distinct;
	/// the upgraded tuple when it can be looked through right away; otherwise a
	/// temporary tuple that is replaced later, once the original is available.
	Metadata *BitcodeReaderMetadataList::upgradeTypeRefArray(Metadata *MaybeTuple) {
	  auto *Tuple = dyn_cast_or_null<MDTuple>(MaybeTuple);
	  if (!Tuple || Tuple->isDistinct())
	    return MaybeTuple;

	  // Look through the array immediately if possible.
	  if (!Tuple->isTemporary())
	    return resolveTypeRefArray(Tuple);

	  // Create and return a placeholder to use for now. Eventually
	  // tryToResolveCycles() will resolve this forward reference through
	  // resolveTypeRefArray().
	  OldTypeRefs.Arrays.emplace_back(
	      std::piecewise_construct, std::forward_as_tuple(Tuple),
	      std::forward_as_tuple(MDTuple::getTemporary(Context, None)));
	  return OldTypeRefs.Arrays.back().second.get();
	}

	/// Build the upgraded form of a type-ref array.
	///
	/// \param MaybeTuple metadata that may be a tuple of type refs.
	/// \returns \p MaybeTuple unchanged when it is not a tuple or is distinct;
	/// otherwise a tuple whose operands are the result of upgradeTypeRef() on
	/// each original operand.
	Metadata *BitcodeReaderMetadataList::resolveTypeRefArray(Metadata *MaybeTuple) {
	  auto *Tuple = dyn_cast_or_null<MDTuple>(MaybeTuple);
	  if (!Tuple || Tuple->isDistinct())
	    return MaybeTuple;

	  // Look through the DITypeRefArray, upgrading each DITypeRef.
	  SmallVector<Metadata *, 32> Ops;
	  Ops.reserve(Tuple->getNumOperands());
	  for (Metadata *MD : Tuple->operands())
	    Ops.push_back(upgradeTypeRef(MD));

	  return MDTuple::get(Context, Ops);
	}

	namespace {

	class PlaceholderQueue {
	// Placeholders would thrash around when moved, so store in a std::deque
	// instead of some sort of vector.
	std::deque<DistinctMDOperandPlaceholder> PHs;

	public:
	bool empty() { return PHs.empty(); }
	DistinctMDOperandPlaceholder &getPlaceholderOp(unsigned ID);
	void flush(BitcodeReaderMetadataList &MetadataList);

	/// Return the list of temporaries nodes in the queue, these need to be
	/// loaded before we can flush the queue.
	void getTemporaries(BitcodeReaderMetadataList &MetadataList,
	DenseSet<unsigned> &Temporaries) {
	for (auto &PH : PHs) {
	auto ID = PH.getID();
	auto *MD = MetadataList.lookup(ID);
	if (!MD) {
	Temporaries.insert(ID);
	continue;
	}
	auto *N = dyn_cast_or_null<MDNode>(MD);
	if (N && N->isTemporary())
	Temporaries.insert(ID);
	}
	}
	};

	} // end anonymous namespace

	/// Append a new placeholder for metadata \p ID to the queue and return a
	/// reference to it.
	DistinctMDOperandPlaceholder &PlaceholderQueue::getPlaceholderOp(unsigned ID) {
	  PHs.emplace_back(ID);
	  DistinctMDOperandPlaceholder &Newest = PHs.back();
	  return Newest;
	}

	/// Drain the queue front to back, replacing the use held by each placeholder
	/// with the metadata registered under its ID in \p MetadataList. Asserts
	/// that the metadata exists and, for nodes, that it is resolved.
	void PlaceholderQueue::flush(BitcodeReaderMetadataList &MetadataList) {
	  for (; !PHs.empty(); PHs.pop_front()) {
	    DistinctMDOperandPlaceholder &PH = PHs.front();
	    auto *MD = MetadataList.lookup(PH.getID());
	    assert(MD && "Flushing placeholder on unassigned MD");
	#ifndef NDEBUG
	    if (auto *MDN = dyn_cast<MDNode>(MD))
	      assert(MDN->isResolved() &&
	             "Flushing Placeholder while cycles aren't resolved");
	#endif
	    PH.replaceUseWith(MD);
	  }
	}

	} // anonymous namespace

	/// Implementation object behind MetadataLoader. It owns the list of parsed
	/// metadata, the indexes used for on-demand loading, and the state needed to
	/// upgrade old-style debug info after a block has been read.
	class MetadataLoader::MetadataLoaderImpl {
	  BitcodeReaderMetadataList MetadataList;
	  BitcodeReaderValueList &ValueList;
	  BitstreamCursor &Stream;
	  LLVMContext &Context;
	  Module &TheModule;
	  std::function<Type *(unsigned)> getTypeByID;

	  /// Cursor associated with the lazy-loading of Metadata. This is the easy way
	  /// to keep around the right "context" (Abbrev list) to be able to jump in
	  /// the middle of the metadata block and load any record.
	  BitstreamCursor IndexCursor;

	  /// Index that keeps track of MDString values.
	  std::vector<StringRef> MDStringRef;

	  /// On-demand loading of a single MDString. Requires the index above to be
	  /// populated.
	  MDString *lazyLoadOneMDString(unsigned Idx);

	  /// Index that keeps track of where to find a metadata record in the stream.
	  std::vector<uint64_t> GlobalMetadataBitPosIndex;

	  /// Populate the index above to enable lazily loading of metadata, and load
	  /// the named metadata as well as the transitively referenced global
	  /// Metadata.
	  Expected<bool> lazyLoadModuleMetadataBlock();

	  /// On-demand loading of a single metadata. Requires the index above to be
	  /// populated.
	  void lazyLoadOneMetadata(unsigned Idx, PlaceholderQueue &Placeholders);

	  // Keep a mapping of the seen pairs of old-style CU <-> SP, and update
	  // pointers to point from SP to CU after a block is completely parsed.
	  std::vector<std::pair<DICompileUnit *, Metadata *>> CUSubprograms;

	  /// Functions that need to be matched with subprograms when upgrading old
	  /// metadata.
	  SmallDenseMap<Function *, DISubprogram *, 16> FunctionsWithSPs;

	  // Map the bitcode's custom MDKind ID to the Module's MDKind ID.
	  DenseMap<unsigned, unsigned> MDKindMap;

	  bool StripTBAA = false;
	  bool HasSeenOldLoopTags = false;
	+   bool NeedUpgradeToDIGlobalVariableExpression = false;

	  /// True if metadata is being parsed for a module being ThinLTO imported.
	  bool IsImporting = false;

	  Error parseOneMetadata(SmallVectorImpl<uint64_t> &Record, unsigned Code,
	                         PlaceholderQueue &Placeholders, StringRef Blob,
	                         unsigned &NextMetadataNo);
	  Error parseMetadataStrings(ArrayRef<uint64_t> Record, StringRef Blob,
	                             std::function<void(StringRef)> CallBack);
	  Error parseGlobalObjectAttachment(GlobalObject &GO,
	                                    ArrayRef<uint64_t> Record);
	  Error parseMetadataKindRecord(SmallVectorImpl<uint64_t> &Record);

	  void resolveForwardRefsAndPlaceholders(PlaceholderQueue &Placeholders);

	  /// Upgrade old-style CU <-> SP pointers to point from SP to CU.
	  void upgradeCUSubprograms() {
	    for (auto CU_SP : CUSubprograms)
	      if (auto *SPs = dyn_cast_or_null<MDTuple>(CU_SP.second))
	        for (auto &Op : SPs->operands())
	          if (auto *SP = dyn_cast_or_null<MDNode>(Op))
	            SP->replaceOperandWith(7, CU_SP.first);
	    CUSubprograms.clear();
	  }

	+   /// Upgrade old-style bare DIGlobalVariables to DIGlobalVariableExpressions.
	+   /// Does nothing unless NeedUpgradeToDIGlobalVariableExpression is set.
	+   void upgradeCUVariables() {
	+     if (!NeedUpgradeToDIGlobalVariableExpression)
	+       return;
	+
	+     // Upgrade list of variables attached to the CUs.
	+     if (NamedMDNode *CUNodes = TheModule.getNamedMetadata("llvm.dbg.cu"))
	+       for (unsigned I = 0, E = CUNodes->getNumOperands(); I != E; ++I) {
	+         auto *CU = cast<DICompileUnit>(CUNodes->getOperand(I));
	+         if (auto *GVs = dyn_cast_or_null<MDTuple>(CU->getRawGlobalVariables()))
	+           for (unsigned J = 0; J < GVs->getNumOperands(); J++)
	+             if (auto *GV =
	+                     dyn_cast_or_null<DIGlobalVariable>(GVs->getOperand(J))) {
	+               auto *DGVE =
	+                   DIGlobalVariableExpression::getDistinct(Context, GV, nullptr);
	+               GVs->replaceOperandWith(J, DGVE);
	+             }
	+       }
	+
	+     // Upgrade variables attached to globals. The !dbg attachments are
	+     // collected, erased, and re-added with bare variables wrapped.
	+     for (auto &GV : TheModule.globals()) {
	+       SmallVector<MDNode *, 1> MDs;
	+       GV.getMetadata(LLVMContext::MD_dbg, MDs);
	+       GV.eraseMetadata(LLVMContext::MD_dbg);
	+       for (auto *MD : MDs)
	+         if (auto *DGV = dyn_cast_or_null<DIGlobalVariable>(MD)) {
	+           auto *DGVE =
	+               DIGlobalVariableExpression::getDistinct(Context, DGV, nullptr);
	+           GV.addMetadata(LLVMContext::MD_dbg, *DGVE);
	+         } else
	+           GV.addMetadata(LLVMContext::MD_dbg, *MD);
	+     }
	+   }
	+
	+   /// Run all debug-info upgrades that happen after a block is parsed.
	+   void upgradeDebugInfo() {
	+     upgradeCUSubprograms();
	+     upgradeCUVariables();
	+   }
	+
	public:
	  MetadataLoaderImpl(BitstreamCursor &Stream, Module &TheModule,
	                     BitcodeReaderValueList &ValueList,
	                     std::function<Type *(unsigned)> getTypeByID,
	                     bool IsImporting)
	      : MetadataList(TheModule.getContext()), ValueList(ValueList),
	        Stream(Stream), Context(TheModule.getContext()), TheModule(TheModule),
	        getTypeByID(getTypeByID), IsImporting(IsImporting) {}

	  Error parseMetadata(bool ModuleLevel);

	  bool hasFwdRefs() const { return MetadataList.hasFwdRefs(); }

	  /// Return the metadata with the given ID: an indexed MDString is
	  /// materialized on demand, an already-present entry is returned as is, an
	  /// indexed record is lazy-loaded, and anything else becomes a forward
	  /// reference.
	  Metadata *getMetadataFwdRefOrLoad(unsigned ID) {
	    if (ID < MDStringRef.size())
	      return lazyLoadOneMDString(ID);
	    if (auto *MD = MetadataList.lookup(ID))
	      return MD;
	    // If lazy-loading is enabled, we try recursively to load the operand
	    // instead of creating a temporary.
	    if (ID < (MDStringRef.size() + GlobalMetadataBitPosIndex.size())) {
	      PlaceholderQueue Placeholders;
	      lazyLoadOneMetadata(ID, Placeholders);
	      resolveForwardRefsAndPlaceholders(Placeholders);
	      return MetadataList.lookup(ID);
	    }
	    return MetadataList.getMetadataFwdRef(ID);
	  }

	  MDNode *getMDNodeFwdRefOrNull(unsigned Idx) {
	    return MetadataList.getMDNodeFwdRefOrNull(Idx);
	  }

	  /// Return the subprogram recorded for \p F, or nullptr when there is none.
	  DISubprogram *lookupSubprogramForFunction(Function *F) {
	    return FunctionsWithSPs.lookup(F);
	  }

	  bool hasSeenOldLoopTags() { return HasSeenOldLoopTags; }

	  Error parseMetadataAttachment(
	      Function &F, const SmallVectorImpl<Instruction *> &InstructionList);

	  Error parseMetadataKinds();

	  void setStripTBAA(bool Value) { StripTBAA = Value; }
	  bool isStrippingTBAA() { return StripTBAA; }

	  unsigned size() const { return MetadataList.size(); }
	  void shrinkTo(unsigned N) { MetadataList.shrinkTo(N); }
	};

	/// Build an Error for malformed input: a StringError that carries \p Message
	/// together with the BitcodeError::CorruptedBitcode error code.
	Error error(const Twine &Message) {
	  auto EC = make_error_code(BitcodeError::CorruptedBitcode);
	  return make_error<StringError>(Message, EC);
	}

	/// Scan the module-level metadata block with IndexCursor and build the
	/// indexes used for on-demand loading.
	///
	/// MDStrings are indexed into MDStringRef, record bit positions into
	/// GlobalMetadataBitPosIndex, and named metadata and global declaration
	/// attachments are materialized right away.
	///
	/// \returns true when the end of the block was reached with the records
	/// handled above; false (after clearing both indexes) when a record kind is
	/// met that this path does not expect, so the caller must fall back to
	/// eager parsing; an Error when the block is malformed.
	Expected<bool>
	MetadataLoader::MetadataLoaderImpl::lazyLoadModuleMetadataBlock() {
	  IndexCursor = Stream;
	  SmallVector<uint64_t, 64> Record;
	  // Get the abbrevs, and preload record positions to make them lazy-loadable.
	  while (true) {
	    BitstreamEntry Entry = IndexCursor.advanceSkippingSubblocks(
	        BitstreamCursor::AF_DontPopBlockAtEnd);
	    switch (Entry.Kind) {
	    case BitstreamEntry::SubBlock: // Handled for us already.
	    case BitstreamEntry::Error:
	      return error("Malformed block");
	    case BitstreamEntry::EndBlock: {
	      return true;
	    }
	    case BitstreamEntry::Record: {
	      // The interesting case.
	      ++NumMDRecordLoaded;
	      uint64_t CurrentPos = IndexCursor.GetCurrentBitNo();
	      auto Code = IndexCursor.skipRecord(Entry.ID);
	      switch (Code) {
	      case bitc::METADATA_STRINGS: {
	        // Rewind and parse the strings.
	        IndexCursor.JumpToBit(CurrentPos);
	        StringRef Blob;
	        Record.clear();
	        IndexCursor.readRecord(Entry.ID, Record, &Blob);
	        // Validate before indexing: the string count lives in Record[0].
	        if (Record.empty())
	          return error("Invalid record");
	        unsigned NumStrings = Record[0];
	        MDStringRef.reserve(NumStrings);
	        auto IndexNextMDString = [&](StringRef Str) {
	          MDStringRef.push_back(Str);
	        };
	        if (auto Err = parseMetadataStrings(Record, Blob, IndexNextMDString))
	          return std::move(Err);
	        break;
	      }
	      case bitc::METADATA_INDEX_OFFSET: {
	        // This is the offset to the index, when we see this we skip all the
	        // records and load only an index to these.
	        IndexCursor.JumpToBit(CurrentPos);
	        Record.clear();
	        IndexCursor.readRecord(Entry.ID, Record);
	        if (Record.size() != 2)
	          return error("Invalid record");
	        // The 64-bit offset is split across two 32-bit fields.
	        auto Offset = Record[0] + (Record[1] << 32);
	        auto BeginPos = IndexCursor.GetCurrentBitNo();
	        IndexCursor.JumpToBit(BeginPos + Offset);
	        Entry = IndexCursor.advanceSkippingSubblocks(
	            BitstreamCursor::AF_DontPopBlockAtEnd);
	        assert(Entry.Kind == BitstreamEntry::Record &&
	               "Corrupted bitcode: Expected `Record` when trying to find the "
	               "Metadata index");
	        Record.clear();
	        auto Code = IndexCursor.readRecord(Entry.ID, Record);
	        (void)Code;
	        assert(Code == bitc::METADATA_INDEX && "Corrupted bitcode: Expected "
	                                               "`METADATA_INDEX` when trying "
	                                               "to find the Metadata index");

	        // Delta unpack
	        auto CurrentValue = BeginPos;
	        GlobalMetadataBitPosIndex.reserve(Record.size());
	        for (auto &Elt : Record) {
	          CurrentValue += Elt;
	          GlobalMetadataBitPosIndex.push_back(CurrentValue);
	        }
	        break;
	      }
	      case bitc::METADATA_INDEX:
	        // We don't expect to get there, the Index is loaded when we encounter
	        // the offset.
	        return error("Corrupted Metadata block");
	      case bitc::METADATA_NAME: {
	        // Named metadata need to be materialized now and aren't deferred.
	        IndexCursor.JumpToBit(CurrentPos);
	        Record.clear();
	        unsigned Code = IndexCursor.readRecord(Entry.ID, Record);
	        assert(Code == bitc::METADATA_NAME);

	        // Read name of the named metadata.
	        SmallString<8> Name(Record.begin(), Record.end());
	        Code = IndexCursor.ReadCode();

	        // Named Metadata comes in two parts, we expect the name to be followed
	        // by the node
	        Record.clear();
	        unsigned NextBitCode = IndexCursor.readRecord(Code, Record);
	        assert(NextBitCode == bitc::METADATA_NAMED_NODE);
	        (void)NextBitCode;

	        // Read named metadata elements.
	        unsigned Size = Record.size();
	        NamedMDNode *NMD = TheModule.getOrInsertNamedMetadata(Name);
	        for (unsigned i = 0; i != Size; ++i) {
	          // FIXME: We could use a placeholder here, however NamedMDNode are
	          // taking MDNode as operand and not using the Metadata infrastructure.
	          // It is acknowledged by 'TODO: Inherit from Metadata' in the
	          // NamedMDNode class definition.
	          MDNode *MD = MetadataList.getMDNodeFwdRefOrNull(Record[i]);
	          // Report a non-node operand as an error in every build mode, like
	          // the eager METADATA_NAME parsing does, rather than only asserting.
	          if (!MD)
	            return error("Invalid record");
	          NMD->addOperand(MD);
	        }
	        break;
	      }
	      case bitc::METADATA_GLOBAL_DECL_ATTACHMENT: {
	        // FIXME: we need to do this early because we don't materialize global
	        // value explicitly.
	        IndexCursor.JumpToBit(CurrentPos);
	        Record.clear();
	        IndexCursor.readRecord(Entry.ID, Record);
	        if (Record.size() % 2 == 0)
	          return error("Invalid record");
	        unsigned ValueID = Record[0];
	        if (ValueID >= ValueList.size())
	          return error("Invalid record");
	        if (auto *GO = dyn_cast<GlobalObject>(ValueList[ValueID]))
	          if (Error Err = parseGlobalObjectAttachment(
	                  *GO, ArrayRef<uint64_t>(Record).slice(1)))
	            return std::move(Err);
	        break;
	      }
	      case bitc::METADATA_KIND:
	      case bitc::METADATA_STRING_OLD:
	      case bitc::METADATA_OLD_FN_NODE:
	      case bitc::METADATA_OLD_NODE:
	      case bitc::METADATA_VALUE:
	      case bitc::METADATA_DISTINCT_NODE:
	      case bitc::METADATA_NODE:
	      case bitc::METADATA_LOCATION:
	      case bitc::METADATA_GENERIC_DEBUG:
	      case bitc::METADATA_SUBRANGE:
	      case bitc::METADATA_ENUMERATOR:
	      case bitc::METADATA_BASIC_TYPE:
	      case bitc::METADATA_DERIVED_TYPE:
	      case bitc::METADATA_COMPOSITE_TYPE:
	      case bitc::METADATA_SUBROUTINE_TYPE:
	      case bitc::METADATA_MODULE:
	      case bitc::METADATA_FILE:
	      case bitc::METADATA_COMPILE_UNIT:
	      case bitc::METADATA_SUBPROGRAM:
	      case bitc::METADATA_LEXICAL_BLOCK:
	      case bitc::METADATA_LEXICAL_BLOCK_FILE:
	      case bitc::METADATA_NAMESPACE:
	      case bitc::METADATA_MACRO:
	      case bitc::METADATA_MACRO_FILE:
	      case bitc::METADATA_TEMPLATE_TYPE:
	      case bitc::METADATA_TEMPLATE_VALUE:
	      case bitc::METADATA_GLOBAL_VAR:
	      case bitc::METADATA_LOCAL_VAR:
	      case bitc::METADATA_EXPRESSION:
	      case bitc::METADATA_OBJC_PROPERTY:
	      case bitc::METADATA_IMPORTED_ENTITY:
	      case bitc::METADATA_GLOBAL_VAR_EXPR:
	        // We don't expect to see any of these, if we see one, give up on
	        // lazy-loading and fallback.
	        MDStringRef.clear();
	        GlobalMetadataBitPosIndex.clear();
	        return false;
	      }
	      break;
	    }
	    }
	  }
	}

	/// Parse a METADATA_BLOCK. If ModuleLevel is true then we are parsing
	/// module level metadata.
	///
	/// A non-module-level block is rejected while forward references are still
	/// pending. For a module-level block read while importing, with an empty
	/// metadata list and lazy loading not disabled, an index is built first; if
	/// that succeeds the block is skipped and records are loaded on demand.
	/// Otherwise every record in the block is read in order.
	///
	/// Returns Error::success() once the block has been consumed, or an Error
	/// describing the malformed input.
	Error MetadataLoader::MetadataLoaderImpl::parseMetadata(bool ModuleLevel) {
	if (!ModuleLevel && MetadataList.hasFwdRefs())
	return error("Invalid metadata: fwd refs into function blocks");

	// Record the entry position so that we can jump back here and efficiently
	// skip the whole block in case we lazy-load.
	auto EntryPos = Stream.GetCurrentBitNo();

	if (Stream.EnterSubBlock(bitc::METADATA_BLOCK_ID))
	return error("Invalid record");

	SmallVector<uint64_t, 64> Record;
	PlaceholderQueue Placeholders;

	// We lazy-load module-level metadata: we build an index for each record, and
	// then load individual record as needed, starting with the named metadata.
	if (ModuleLevel && IsImporting && MetadataList.empty() &&
	!DisableLazyLoading) {
	auto SuccessOrErr = lazyLoadModuleMetadataBlock();
	if (!SuccessOrErr)
	return SuccessOrErr.takeError();
	if (SuccessOrErr.get()) {
	// An index was successfully created and we will be able to load metadata
	// on-demand.
	MetadataList.resize(MDStringRef.size() +
	GlobalMetadataBitPosIndex.size());

	// Reading the named metadata created forward references and/or
	// placeholders, that we flush here.
	resolveForwardRefsAndPlaceholders(Placeholders);
	- upgradeCUSubprograms();
	+ upgradeDebugInfo();
	// Return at the beginning of the block, since it is easy to skip it
	// entirely from there.
	Stream.ReadBlockEnd(); // Pop the abbrev block context.
	Stream.JumpToBit(EntryPos);
	if (Stream.SkipBlock())
	return error("Invalid record");
	return Error::success();
	}
	// Couldn't load an index, fallback to loading all the block "old-style".
	}

	// New records are numbered starting after whatever is already in the list.
	unsigned NextMetadataNo = MetadataList.size();

	// Read all the records.
	while (true) {
	BitstreamEntry Entry = Stream.advanceSkippingSubblocks();

	switch (Entry.Kind) {
	case BitstreamEntry::SubBlock: // Handled for us already.
	case BitstreamEntry::Error:
	return error("Malformed block");
	case BitstreamEntry::EndBlock:
	// End of block: settle forward references, then run the upgrades.
	resolveForwardRefsAndPlaceholders(Placeholders);
	- upgradeCUSubprograms();
	+ upgradeDebugInfo();
	return Error::success();
	case BitstreamEntry::Record:
	// The interesting case.
	break;
	}

	// Read a record.
	Record.clear();
	StringRef Blob;
	++NumMDRecordLoaded;
	unsigned Code = Stream.readRecord(Entry.ID, Record, &Blob);
	if (Error Err =
	parseOneMetadata(Record, Code, Placeholders, Blob, NextMetadataNo))
	return Err;
	}
	}

	/// Return the MDString with the given ID, creating it from the string index
	/// and registering it in the metadata list the first time it is requested.
	MDString *MetadataLoader::MetadataLoaderImpl::lazyLoadOneMDString(unsigned ID) {
	  ++NumMDStringLoaded;

	  // Already materialized by an earlier request.
	  Metadata *Cached = MetadataList.lookup(ID);
	  if (Cached)
	    return cast<MDString>(Cached);

	  MDString *Str = MDString::get(Context, MDStringRef[ID]);
	  MetadataList.assignValue(Str, ID);
	  return Str;
	}

	void MetadataLoader::MetadataLoaderImpl::lazyLoadOneMetadata(
	unsigned ID, PlaceholderQueue &Placeholders) {
	assert(ID < (MDStringRef.size()) + GlobalMetadataBitPosIndex.size());
	assert(ID >= MDStringRef.size() && "Unexpected lazy-loading of MDString");
	// Lookup first if the metadata hasn't already been loaded.
	if (auto *MD = MetadataList.lookup(ID)) {
	auto *N = dyn_cast_or_null<MDNode>(MD);
	if (!N->isTemporary())
	return;
	}
	SmallVector<uint64_t, 64> Record;
	StringRef Blob;
	IndexCursor.JumpToBit(GlobalMetadataBitPosIndex[ID - MDStringRef.size()]);
	auto Entry = IndexCursor.advanceSkippingSubblocks();
	++NumMDRecordLoaded;
	unsigned Code = IndexCursor.readRecord(Entry.ID, Record, &Blob);
	if (Error Err = parseOneMetadata(Record, Code, Placeholders, Blob, ID))
	report_fatal_error("Can't lazyload MD");
	}

	/// Make sure no forward reference or unloaded placeholder target remains,
	/// lazy-loading metadata repeatedly until that holds, then resolve cycles
	/// and flush \p Placeholders.
	void MetadataLoader::MetadataLoaderImpl::resolveForwardRefsAndPlaceholders(
	    PlaceholderQueue &Placeholders) {
	  DenseSet<unsigned> Pending;
	  for (;;) {
	    // Collect the placeholder targets that haven't been loaded yet.
	    Placeholders.getTemporaries(MetadataList, Pending);

	    // Nothing pending and no forward reference left: we're done.
	    const bool Settled = Pending.empty() && !MetadataList.hasFwdRefs();
	    if (Settled)
	      break;

	    // Load every pending target first. Doing so can add new placeholders or
	    // forward references.
	    for (auto ID : Pending)
	      lazyLoadOneMetadata(ID, Placeholders);
	    Pending.clear();

	    // Then load the forward references, which can likewise add new
	    // placeholders or forward references.
	    while (MetadataList.hasFwdRefs())
	      lazyLoadOneMetadata(MetadataList.getNextFwdRef(), Placeholders);
	  }

	  // No forward reference or unloaded temporary remains at this point, so
	  // RAUW support can be dropped and cycles marked as resolved.
	  MetadataList.tryToResolveCycles();

	  // With everything in place, swap each placeholder operand for the final
	  // node it refers to.
	  Placeholders.flush(MetadataList);
	}

	Error MetadataLoader::MetadataLoaderImpl::parseOneMetadata(
	SmallVectorImpl<uint64_t> &Record, unsigned Code,
	PlaceholderQueue &Placeholders, StringRef Blob, unsigned &NextMetadataNo) {

	bool IsDistinct = false;
	auto getMD = [&](unsigned ID) -> Metadata * {
	if (ID < MDStringRef.size())
	return lazyLoadOneMDString(ID);
	if (!IsDistinct) {
	if (auto *MD = MetadataList.lookup(ID))
	return MD;
	// If lazy-loading is enabled, we try recursively to load the operand
	// instead of creating a temporary.
	if (ID < (MDStringRef.size() + GlobalMetadataBitPosIndex.size())) {
	// Create a temporary for the node that is referencing the operand we
	// will lazy-load. It is needed before recursing in case there are
	// uniquing cycles.
	MetadataList.getMetadataFwdRef(NextMetadataNo);
	lazyLoadOneMetadata(ID, Placeholders);
	return MetadataList.lookup(ID);
	}
	// Return a temporary.
	return MetadataList.getMetadataFwdRef(ID);
	}
	if (auto *MD = MetadataList.getMetadataIfResolved(ID))
	return MD;
	return &Placeholders.getPlaceholderOp(ID);
	};
	auto getMDOrNull = [&](unsigned ID) -> Metadata * {
	if (ID)
	return getMD(ID - 1);
	return nullptr;
	};
	auto getMDOrNullWithoutPlaceholders = [&](unsigned ID) -> Metadata * {
	if (ID)
	return MetadataList.getMetadataFwdRef(ID - 1);
	return nullptr;
	};
	auto getMDString = [&](unsigned ID) -> MDString * {
	// This requires that the ID is not really a forward reference. In
	// particular, the MDString must already have been resolved.
	auto MDS = getMDOrNull(ID);
	return cast_or_null<MDString>(MDS);
	};

	// Support for old type refs.
	auto getDITypeRefOrNull = [&](unsigned ID) {
	return MetadataList.upgradeTypeRef(getMDOrNull(ID));
	};

	#define GET_OR_DISTINCT(CLASS, ARGS) \
	(IsDistinct ? CLASS::getDistinct ARGS : CLASS::get ARGS)

	switch (Code) {
	default: // Default behavior: ignore.
	break;
	case bitc::METADATA_NAME: {
	// Read name of the named metadata.
	SmallString<8> Name(Record.begin(), Record.end());
	Record.clear();
	Code = Stream.ReadCode();

	++NumMDRecordLoaded;
	unsigned NextBitCode = Stream.readRecord(Code, Record);
	if (NextBitCode != bitc::METADATA_NAMED_NODE)
	return error("METADATA_NAME not followed by METADATA_NAMED_NODE");

	// Read named metadata elements.
	unsigned Size = Record.size();
	NamedMDNode *NMD = TheModule.getOrInsertNamedMetadata(Name);
	for (unsigned i = 0; i != Size; ++i) {
	MDNode *MD = MetadataList.getMDNodeFwdRefOrNull(Record[i]);
	if (!MD)
	return error("Invalid record");
	NMD->addOperand(MD);
	}
	break;
	}
	case bitc::METADATA_OLD_FN_NODE: {
	// FIXME: Remove in 4.0.
	// This is a LocalAsMetadata record, the only type of function-local
	// metadata.
	if (Record.size() % 2 == 1)
	return error("Invalid record");

	// If this isn't a LocalAsMetadata record, we're dropping it. This used
	// to be legal, but there's no upgrade path.
	auto dropRecord = [&] {
	MetadataList.assignValue(MDNode::get(Context, None), NextMetadataNo);
	NextMetadataNo++;
	};
	if (Record.size() != 2) {
	dropRecord();
	break;
	}

	Type *Ty = getTypeByID(Record[0]);
	if (Ty->isMetadataTy() \|\| Ty->isVoidTy()) {
	dropRecord();
	break;
	}

	MetadataList.assignValue(
	LocalAsMetadata::get(ValueList.getValueFwdRef(Record[1], Ty)),
	NextMetadataNo);
	NextMetadataNo++;
	break;
	}
	case bitc::METADATA_OLD_NODE: {
	// FIXME: Remove in 4.0.
	if (Record.size() % 2 == 1)
	return error("Invalid record");

	unsigned Size = Record.size();
	SmallVector<Metadata *, 8> Elts;
	for (unsigned i = 0; i != Size; i += 2) {
	Type *Ty = getTypeByID(Record[i]);
	if (!Ty)
	return error("Invalid record");
	if (Ty->isMetadataTy())
	Elts.push_back(getMD(Record[i + 1]));
	else if (!Ty->isVoidTy()) {
	auto *MD =
	ValueAsMetadata::get(ValueList.getValueFwdRef(Record[i + 1], Ty));
	assert(isa<ConstantAsMetadata>(MD) &&
	"Expected non-function-local metadata");
	Elts.push_back(MD);
	} else
	Elts.push_back(nullptr);
	}
	MetadataList.assignValue(MDNode::get(Context, Elts), NextMetadataNo);
	NextMetadataNo++;
	break;
	}
	case bitc::METADATA_VALUE: {
	if (Record.size() != 2)
	return error("Invalid record");

	Type *Ty = getTypeByID(Record[0]);
	if (Ty->isMetadataTy() \|\| Ty->isVoidTy())
	return error("Invalid record");

	MetadataList.assignValue(
	ValueAsMetadata::get(ValueList.getValueFwdRef(Record[1], Ty)),
	NextMetadataNo);
	NextMetadataNo++;
	break;
	}
	case bitc::METADATA_DISTINCT_NODE:
	IsDistinct = true;
	LLVM_FALLTHROUGH;
	case bitc::METADATA_NODE: {
	SmallVector<Metadata *, 8> Elts;
	Elts.reserve(Record.size());
	for (unsigned ID : Record)
	Elts.push_back(getMDOrNull(ID));
	MetadataList.assignValue(IsDistinct ? MDNode::getDistinct(Context, Elts)
	: MDNode::get(Context, Elts),
	NextMetadataNo);
	NextMetadataNo++;
	break;
	}
	case bitc::METADATA_LOCATION: {
	if (Record.size() != 5)
	return error("Invalid record");

	IsDistinct = Record[0];
	unsigned Line = Record[1];
	unsigned Column = Record[2];
	Metadata *Scope = getMD(Record[3]);
	Metadata *InlinedAt = getMDOrNull(Record[4]);
	MetadataList.assignValue(
	GET_OR_DISTINCT(DILocation, (Context, Line, Column, Scope, InlinedAt)),
	NextMetadataNo);
	NextMetadataNo++;
	break;
	}
	case bitc::METADATA_GENERIC_DEBUG: {
	if (Record.size() < 4)
	return error("Invalid record");

	IsDistinct = Record[0];
	unsigned Tag = Record[1];
	unsigned Version = Record[2];

	if (Tag >= 1u << 16 \|\| Version != 0)
	return error("Invalid record");

	auto *Header = getMDString(Record[3]);
	SmallVector<Metadata *, 8> DwarfOps;
	for (unsigned I = 4, E = Record.size(); I != E; ++I)
	DwarfOps.push_back(getMDOrNull(Record[I]));
	MetadataList.assignValue(
	GET_OR_DISTINCT(GenericDINode, (Context, Tag, Header, DwarfOps)),
	NextMetadataNo);
	NextMetadataNo++;
	break;
	}
	case bitc::METADATA_SUBRANGE: {
	if (Record.size() != 3)
	return error("Invalid record");

	IsDistinct = Record[0];
	MetadataList.assignValue(
	GET_OR_DISTINCT(DISubrange,
	(Context, Record[1], unrotateSign(Record[2]))),
	NextMetadataNo);
	NextMetadataNo++;
	break;
	}
	case bitc::METADATA_ENUMERATOR: {
	if (Record.size() != 3)
	return error("Invalid record");

	IsDistinct = Record[0];
	MetadataList.assignValue(
	GET_OR_DISTINCT(DIEnumerator, (Context, unrotateSign(Record[1]),
	getMDString(Record[2]))),
	NextMetadataNo);
	NextMetadataNo++;
	break;
	}
	case bitc::METADATA_BASIC_TYPE: {
	if (Record.size() != 6)
	return error("Invalid record");

	IsDistinct = Record[0];
	MetadataList.assignValue(
	GET_OR_DISTINCT(DIBasicType,
	(Context, Record[1], getMDString(Record[2]), Record[3],
	Record[4], Record[5])),
	NextMetadataNo);
	NextMetadataNo++;
	break;
	}
	case bitc::METADATA_DERIVED_TYPE: {
	if (Record.size() != 12)
	return error("Invalid record");

	IsDistinct = Record[0];
	DINode::DIFlags Flags = static_cast<DINode::DIFlags>(Record[10]);
	MetadataList.assignValue(
	GET_OR_DISTINCT(DIDerivedType,
	(Context, Record[1], getMDString(Record[2]),
	getMDOrNull(Record[3]), Record[4],
	getDITypeRefOrNull(Record[5]),
	getDITypeRefOrNull(Record[6]), Record[7], Record[8],
	Record[9], Flags, getDITypeRefOrNull(Record[11]))),
	NextMetadataNo);
	NextMetadataNo++;
	break;
	}
	case bitc::METADATA_COMPOSITE_TYPE: {
	if (Record.size() != 16)
	return error("Invalid record");

	// If we have a UUID and this is not a forward declaration, lookup the
	// mapping.
	IsDistinct = Record[0] & 0x1;
	bool IsNotUsedInTypeRef = Record[0] >= 2;
	unsigned Tag = Record[1];
	MDString *Name = getMDString(Record[2]);
	Metadata *File = getMDOrNull(Record[3]);
	unsigned Line = Record[4];
	Metadata *Scope = getDITypeRefOrNull(Record[5]);
	Metadata *BaseType = nullptr;
	uint64_t SizeInBits = Record[7];
	if (Record[8] > (uint64_t)std::numeric_limits<uint32_t>::max())
	return error("Alignment value is too large");
	uint32_t AlignInBits = Record[8];
	uint64_t OffsetInBits = 0;
	DINode::DIFlags Flags = static_cast<DINode::DIFlags>(Record[10]);
	Metadata *Elements = nullptr;
	unsigned RuntimeLang = Record[12];
	Metadata *VTableHolder = nullptr;
	Metadata *TemplateParams = nullptr;
	auto *Identifier = getMDString(Record[15]);
	// If this module is being parsed so that it can be ThinLTO imported
	// into another module, composite types only need to be imported
	// as type declarations (unless full type definitions requested).
	// Create type declarations up front to save memory. Also, buildODRType
	// handles the case where this is type ODRed with a definition needed
	// by the importing module, in which case the existing definition is
	// used.
	if (IsImporting && !ImportFullTypeDefinitions && Identifier &&
	(Tag == dwarf::DW_TAG_enumeration_type \|\|
	Tag == dwarf::DW_TAG_class_type \|\|
	Tag == dwarf::DW_TAG_structure_type \|\|
	Tag == dwarf::DW_TAG_union_type)) {
	Flags = Flags \| DINode::FlagFwdDecl;
	} else {
	BaseType = getDITypeRefOrNull(Record[6]);
	OffsetInBits = Record[9];
	Elements = getMDOrNull(Record[11]);
	VTableHolder = getDITypeRefOrNull(Record[13]);
	TemplateParams = getMDOrNull(Record[14]);
	}
	DICompositeType *CT = nullptr;
	if (Identifier)
	CT = DICompositeType::buildODRType(
	Context, *Identifier, Tag, Name, File, Line, Scope, BaseType,
	SizeInBits, AlignInBits, OffsetInBits, Flags, Elements, RuntimeLang,
	VTableHolder, TemplateParams);

	// Create a node if we didn't get a lazy ODR type.
	if (!CT)
	CT = GET_OR_DISTINCT(DICompositeType,
	(Context, Tag, Name, File, Line, Scope, BaseType,
	SizeInBits, AlignInBits, OffsetInBits, Flags,
	Elements, RuntimeLang, VTableHolder, TemplateParams,
	Identifier));
	if (!IsNotUsedInTypeRef && Identifier)
	MetadataList.addTypeRef(Identifier, cast<DICompositeType>(CT));

	MetadataList.assignValue(CT, NextMetadataNo);
	NextMetadataNo++;
	break;
	}
	case bitc::METADATA_SUBROUTINE_TYPE: {
	if (Record.size() < 3 \|\| Record.size() > 4)
	return error("Invalid record");
	bool IsOldTypeRefArray = Record[0] < 2;
	unsigned CC = (Record.size() > 3) ? Record[3] : 0;

	IsDistinct = Record[0] & 0x1;
	DINode::DIFlags Flags = static_cast<DINode::DIFlags>(Record[1]);
	Metadata *Types = getMDOrNull(Record[2]);
	if (LLVM_UNLIKELY(IsOldTypeRefArray))
	Types = MetadataList.upgradeTypeRefArray(Types);

	MetadataList.assignValue(
	GET_OR_DISTINCT(DISubroutineType, (Context, Flags, CC, Types)),
	NextMetadataNo);
	NextMetadataNo++;
	break;
	}

	case bitc::METADATA_MODULE: {
	if (Record.size() != 6)
	return error("Invalid record");

	IsDistinct = Record[0];
	MetadataList.assignValue(
	GET_OR_DISTINCT(DIModule,
	(Context, getMDOrNull(Record[1]),
	getMDString(Record[2]), getMDString(Record[3]),
	getMDString(Record[4]), getMDString(Record[5]))),
	NextMetadataNo);
	NextMetadataNo++;
	break;
	}

	case bitc::METADATA_FILE: {
	if (Record.size() != 3 && Record.size() != 5)
	return error("Invalid record");

	IsDistinct = Record[0];
	MetadataList.assignValue(
	GET_OR_DISTINCT(
	DIFile,
	(Context, getMDString(Record[1]), getMDString(Record[2]),
	Record.size() == 3 ? DIFile::CSK_None
	: static_cast<DIFile::ChecksumKind>(Record[3]),
	Record.size() == 3 ? nullptr : getMDString(Record[4]))),
	NextMetadataNo);
	NextMetadataNo++;
	break;
	}
	case bitc::METADATA_COMPILE_UNIT: {
	if (Record.size() < 14 \|\| Record.size() > 17)
	return error("Invalid record");

	// Ignore Record[0], which indicates whether this compile unit is
	// distinct. It's always distinct.
	IsDistinct = true;
	auto *CU = DICompileUnit::getDistinct(
	Context, Record[1], getMDOrNull(Record[2]), getMDString(Record[3]),
	Record[4], getMDString(Record[5]), Record[6], getMDString(Record[7]),
	Record[8], getMDOrNull(Record[9]), getMDOrNull(Record[10]),
	getMDOrNull(Record[12]), getMDOrNull(Record[13]),
	Record.size() <= 15 ? nullptr : getMDOrNull(Record[15]),
	Record.size() <= 14 ? 0 : Record[14],
	Record.size() <= 16 ? true : Record[16]);

	MetadataList.assignValue(CU, NextMetadataNo);
	NextMetadataNo++;

	// Record the list of subprograms so it can be upgraded later.
	if (Metadata *SPs = getMDOrNullWithoutPlaceholders(Record[11]))
	CUSubprograms.push_back({CU, SPs});
	break;
	}
	case bitc::METADATA_SUBPROGRAM: {
	if (Record.size() < 18 \|\| Record.size() > 20)
	return error("Invalid record");

	IsDistinct =
	(Record[0] & 1) \|\| Record[8]; // All definitions should be distinct.
	// Version 1 has a Function as Record[15].
	// Version 2 has removed Record[15].
	// Version 3 has the Unit as Record[15].
	// Version 4 added thisAdjustment.
	bool HasUnit = Record[0] >= 2;
	if (HasUnit && Record.size() < 19)
	return error("Invalid record");
	Metadata *CUorFn = getMDOrNull(Record[15]);
	unsigned Offset = Record.size() >= 19 ? 1 : 0;
	bool HasFn = Offset && !HasUnit;
	bool HasThisAdj = Record.size() >= 20;
	DISubprogram *SP = GET_OR_DISTINCT(
	DISubprogram, (Context,
	getDITypeRefOrNull(Record[1]), // scope
	getMDString(Record[2]), // name
	getMDString(Record[3]), // linkageName
	getMDOrNull(Record[4]), // file
	Record[5], // line
	getMDOrNull(Record[6]), // type
	Record[7], // isLocal
	Record[8], // isDefinition
	Record[9], // scopeLine
	getDITypeRefOrNull(Record[10]), // containingType
	Record[11], // virtuality
	Record[12], // virtualIndex
	HasThisAdj ? Record[19] : 0, // thisAdjustment
	static_cast<DINode::DIFlags>(Record[13] // flags
	),
	Record[14], // isOptimized
	HasUnit ? CUorFn : nullptr, // unit
	getMDOrNull(Record[15 + Offset]), // templateParams
	getMDOrNull(Record[16 + Offset]), // declaration
	getMDOrNull(Record[17 + Offset]) // variables
	));
	MetadataList.assignValue(SP, NextMetadataNo);
	NextMetadataNo++;

	// Upgrade sp->function mapping to function->sp mapping.
	if (HasFn) {
	if (auto *CMD = dyn_cast_or_null<ConstantAsMetadata>(CUorFn))
	if (auto *F = dyn_cast<Function>(CMD->getValue())) {
	if (F->isMaterializable())
	// Defer until materialized; unmaterialized functions may not have
	// metadata.
	FunctionsWithSPs[F] = SP;
	else if (!F->empty())
	F->setSubprogram(SP);
	}
	}
	break;
	}
	case bitc::METADATA_LEXICAL_BLOCK: {
	if (Record.size() != 5)
	return error("Invalid record");

	IsDistinct = Record[0];
	MetadataList.assignValue(
	GET_OR_DISTINCT(DILexicalBlock,
	(Context, getMDOrNull(Record[1]),
	getMDOrNull(Record[2]), Record[3], Record[4])),
	NextMetadataNo);
	NextMetadataNo++;
	break;
	}
	case bitc::METADATA_LEXICAL_BLOCK_FILE: {
	if (Record.size() != 4)
	return error("Invalid record");

	IsDistinct = Record[0];
	MetadataList.assignValue(
	GET_OR_DISTINCT(DILexicalBlockFile,
	(Context, getMDOrNull(Record[1]),
	getMDOrNull(Record[2]), Record[3])),
	NextMetadataNo);
	NextMetadataNo++;
	break;
	}
	case bitc::METADATA_NAMESPACE: {
	if (Record.size() != 5)
	return error("Invalid record");

	IsDistinct = Record[0] & 1;
	bool ExportSymbols = Record[0] & 2;
	MetadataList.assignValue(
	GET_OR_DISTINCT(DINamespace,
	(Context, getMDOrNull(Record[1]),
	getMDOrNull(Record[2]), getMDString(Record[3]),
	Record[4], ExportSymbols)),
	NextMetadataNo);
	NextMetadataNo++;
	break;
	}
	case bitc::METADATA_MACRO: {
	if (Record.size() != 5)
	return error("Invalid record");

	IsDistinct = Record[0];
	MetadataList.assignValue(
	GET_OR_DISTINCT(DIMacro,
	(Context, Record[1], Record[2], getMDString(Record[3]),
	getMDString(Record[4]))),
	NextMetadataNo);
	NextMetadataNo++;
	break;
	}
	case bitc::METADATA_MACRO_FILE: {
	if (Record.size() != 5)
	return error("Invalid record");

	IsDistinct = Record[0];
	MetadataList.assignValue(
	GET_OR_DISTINCT(DIMacroFile,
	(Context, Record[1], Record[2], getMDOrNull(Record[3]),
	getMDOrNull(Record[4]))),
	NextMetadataNo);
	NextMetadataNo++;
	break;
	}
	case bitc::METADATA_TEMPLATE_TYPE: {
	if (Record.size() != 3)
	return error("Invalid record");

	IsDistinct = Record[0];
	MetadataList.assignValue(GET_OR_DISTINCT(DITemplateTypeParameter,
	(Context, getMDString(Record[1]),
	getDITypeRefOrNull(Record[2]))),
	NextMetadataNo);
	NextMetadataNo++;
	break;
	}
	case bitc::METADATA_TEMPLATE_VALUE: {
	if (Record.size() != 5)
	return error("Invalid record");

	IsDistinct = Record[0];
	MetadataList.assignValue(
	GET_OR_DISTINCT(DITemplateValueParameter,
	(Context, Record[1], getMDString(Record[2]),
	getDITypeRefOrNull(Record[3]),
	getMDOrNull(Record[4]))),
	NextMetadataNo);
	NextMetadataNo++;
	break;
	}
	case bitc::METADATA_GLOBAL_VAR: {
	if (Record.size() < 11 \|\| Record.size() > 12)
	return error("Invalid record");

	IsDistinct = Record[0] & 1;
	unsigned Version = Record[0] >> 1;

	if (Version == 1) {
	MetadataList.assignValue(
	GET_OR_DISTINCT(DIGlobalVariable,
	(Context, getMDOrNull(Record[1]),
	getMDString(Record[2]), getMDString(Record[3]),
	getMDOrNull(Record[4]), Record[5],
	getDITypeRefOrNull(Record[6]), Record[7], Record[8],
	getMDOrNull(Record[10]), Record[11])),
	NextMetadataNo);
	NextMetadataNo++;
	} else if (Version == 0) {
	// Upgrade old metadata, which stored a global variable reference or a
	// ConstantInt here.
	Metadata *Expr = getMDOrNull(Record[9]);
	uint32_t AlignInBits = 0;
	if (Record.size() > 11) {
	if (Record[11] > (uint64_t)std::numeric_limits<uint32_t>::max())
	return error("Alignment value is too large");
	AlignInBits = Record[11];
	}
	GlobalVariable *Attach = nullptr;
	if (auto *CMD = dyn_cast_or_null<ConstantAsMetadata>(Expr)) {
	if (auto *GV = dyn_cast<GlobalVariable>(CMD->getValue())) {
	Attach = GV;
	Expr = nullptr;
	} else if (auto *CI = dyn_cast<ConstantInt>(CMD->getValue())) {
	Expr = DIExpression::get(Context,
	{dwarf::DW_OP_constu, CI->getZExtValue(),
	dwarf::DW_OP_stack_value});
	} else {
	Expr = nullptr;
	}
	}
	DIGlobalVariable *DGV = GET_OR_DISTINCT(
	DIGlobalVariable,
	(Context, getMDOrNull(Record[1]), getMDString(Record[2]),
	getMDString(Record[3]), getMDOrNull(Record[4]), Record[5],
	getDITypeRefOrNull(Record[6]), Record[7], Record[8],
	getMDOrNull(Record[10]), AlignInBits));

	- auto *DGVE = DIGlobalVariableExpression::getDistinct(Context, DGV, Expr);
	- MetadataList.assignValue(DGVE, NextMetadataNo);
	- NextMetadataNo++;
	+ DIGlobalVariableExpression *DGVE = nullptr;
	+ if (Attach \|\| Expr)
	+ DGVE = DIGlobalVariableExpression::getDistinct(Context, DGV, Expr);
	+ else
	+ NeedUpgradeToDIGlobalVariableExpression = true;
	if (Attach)
	Attach->addDebugInfo(DGVE);
	+
	+ auto *MDNode = Expr ? cast<Metadata>(DGVE) : cast<Metadata>(DGV);
	+ MetadataList.assignValue(MDNode, NextMetadataNo);
	+ NextMetadataNo++;
	} else
	return error("Invalid record");

	break;
	}
	case bitc::METADATA_LOCAL_VAR: {
	// The 10th field is for the obsolete 'inlinedAt:' field.
	if (Record.size() < 8 \|\| Record.size() > 10)
	return error("Invalid record");

	IsDistinct = Record[0] & 1;
	bool HasAlignment = Record[0] & 2;
	// The 2nd field used to be an artificial tag, either DW_TAG_auto_variable or
	// DW_TAG_arg_variable. If the alignment flag is encoded, this is a newer
	// version of the record, which doesn't have the artificial tag.
	bool HasTag = !HasAlignment && Record.size() > 8;
	DINode::DIFlags Flags = static_cast<DINode::DIFlags>(Record[7 + HasTag]);
	uint32_t AlignInBits = 0;
	if (HasAlignment) {
	if (Record[8 + HasTag] > (uint64_t)std::numeric_limits<uint32_t>::max())
	return error("Alignment value is too large");
	AlignInBits = Record[8 + HasTag];
	}
	MetadataList.assignValue(
	GET_OR_DISTINCT(DILocalVariable,
	(Context, getMDOrNull(Record[1 + HasTag]),
	getMDString(Record[2 + HasTag]),
	getMDOrNull(Record[3 + HasTag]), Record[4 + HasTag],
	getDITypeRefOrNull(Record[5 + HasTag]),
	Record[6 + HasTag], Flags, AlignInBits)),
	NextMetadataNo);
	NextMetadataNo++;
	break;
	}
	case bitc::METADATA_EXPRESSION: {
	if (Record.size() < 1)
	return error("Invalid record");

	IsDistinct = Record[0] & 1;
	bool HasOpFragment = Record[0] & 2;
	auto Elts = MutableArrayRef<uint64_t>(Record).slice(1);
	if (!HasOpFragment)
	if (unsigned N = Elts.size())
	if (N >= 3 && Elts[N - 3] == dwarf::DW_OP_bit_piece)
	Elts[N - 3] = dwarf::DW_OP_LLVM_fragment;

	MetadataList.assignValue(
	GET_OR_DISTINCT(DIExpression, (Context, makeArrayRef(Record).slice(1))),
	NextMetadataNo);
	NextMetadataNo++;
	break;
	}
	case bitc::METADATA_GLOBAL_VAR_EXPR: {
	if (Record.size() != 3)
	return error("Invalid record");

	IsDistinct = Record[0];
	MetadataList.assignValue(GET_OR_DISTINCT(DIGlobalVariableExpression,
	(Context, getMDOrNull(Record[1]),
	getMDOrNull(Record[2]))),
	NextMetadataNo);
	NextMetadataNo++;
	break;
	}
	case bitc::METADATA_OBJC_PROPERTY: {
	if (Record.size() != 8)
	return error("Invalid record");

	IsDistinct = Record[0];
	MetadataList.assignValue(
	GET_OR_DISTINCT(DIObjCProperty,
	(Context, getMDString(Record[1]),
	getMDOrNull(Record[2]), Record[3],
	getMDString(Record[4]), getMDString(Record[5]),
	Record[6], getDITypeRefOrNull(Record[7]))),
	NextMetadataNo);
	NextMetadataNo++;
	break;
	}
	case bitc::METADATA_IMPORTED_ENTITY: {
	if (Record.size() != 6)
	return error("Invalid record");

	IsDistinct = Record[0];
	MetadataList.assignValue(
	GET_OR_DISTINCT(DIImportedEntity,
	(Context, Record[1], getMDOrNull(Record[2]),
	getDITypeRefOrNull(Record[3]), Record[4],
	getMDString(Record[5]))),
	NextMetadataNo);
	NextMetadataNo++;
	break;
	}
	case bitc::METADATA_STRING_OLD: {
	std::string String(Record.begin(), Record.end());

	// Test for upgrading !llvm.loop.
	HasSeenOldLoopTags \|= mayBeOldLoopAttachmentTag(String);
	++NumMDStringLoaded;
	Metadata *MD = MDString::get(Context, String);
	MetadataList.assignValue(MD, NextMetadataNo);
	NextMetadataNo++;
	break;
	}
	case bitc::METADATA_STRINGS: {
	auto CreateNextMDString = [&](StringRef Str) {
	++NumMDStringLoaded;
	MetadataList.assignValue(MDString::get(Context, Str), NextMetadataNo);
	NextMetadataNo++;
	};
	if (Error Err = parseMetadataStrings(Record, Blob, CreateNextMDString))
	return Err;
	break;
	}
	case bitc::METADATA_GLOBAL_DECL_ATTACHMENT: {
	if (Record.size() % 2 == 0)
	return error("Invalid record");
	unsigned ValueID = Record[0];
	if (ValueID >= ValueList.size())
	return error("Invalid record");
	if (auto *GO = dyn_cast<GlobalObject>(ValueList[ValueID]))
	if (Error Err = parseGlobalObjectAttachment(
	*GO, ArrayRef<uint64_t>(Record).slice(1)))
	return Err;
	break;
	}
	case bitc::METADATA_KIND: {
	// Support older bitcode files that had METADATA_KIND records in a
	// block with METADATA_BLOCK_ID.
	if (Error Err = parseMetadataKindRecord(Record))
	return Err;
	break;
	}
	}
	return Error::success();
	#undef GET_OR_DISTINCT
	}

	/// Decode a bulk metadata-strings record and hand every string to CallBack.
	///
	/// Record must hold exactly two fields: the number of strings and the offset
	/// within Blob at which the character data begins. The part of Blob before
	/// that offset is read as a bitstream of VBR6-encoded string lengths; the
	/// part after it holds the concatenated characters. CallBack is invoked once
	/// per string, in order. Returns an error if the record is malformed.
	Error MetadataLoader::MetadataLoaderImpl::parseMetadataStrings(
	    ArrayRef<uint64_t> Record, StringRef Blob,
	    std::function<void(StringRef)> CallBack) {
	  if (Record.size() != 2)
	    return error("Invalid record: metadata strings layout");

	  unsigned Remaining = Record[0];
	  unsigned CharsOffset = Record[1];
	  if (Remaining == 0)
	    return error("Invalid record: metadata strings with no strings");
	  if (CharsOffset > Blob.size())
	    return error("Invalid record: metadata strings corrupt offset");

	  // Lengths come first in the blob, the characters follow.
	  StringRef LengthBytes = Blob.slice(0, CharsOffset);
	  SimpleBitstreamCursor LengthReader(LengthBytes);
	  StringRef Chars = Blob.drop_front(CharsOffset);

	  for (; Remaining != 0; --Remaining) {
	    if (LengthReader.AtEndOfStream())
	      return error("Invalid record: metadata strings bad length");

	    unsigned Len = LengthReader.ReadVBR(6);
	    if (Chars.size() < Len)
	      return error("Invalid record: metadata strings truncated chars");

	    CallBack(Chars.slice(0, Len));
	    Chars = Chars.drop_front(Len);
	  }

	  return Error::success();
	}

	/// Attach the metadata listed in Record to the global object GO.
	///
	/// Record is consumed as consecutive (kind ID, metadata ID) pairs, so its
	/// size must be even. Each kind ID is translated through MDKindMap and each
	/// metadata ID is resolved through MetadataList. Returns an error for an
	/// unknown kind ID or for a metadata ID that does not resolve to an MDNode.
	Error MetadataLoader::MetadataLoaderImpl::parseGlobalObjectAttachment(
	    GlobalObject &GO, ArrayRef<uint64_t> Record) {
	  assert(Record.size() % 2 == 0);
	  const unsigned NumFields = Record.size();
	  unsigned Pos = 0;
	  while (Pos != NumFields) {
	    auto KindIt = MDKindMap.find(Record[Pos]);
	    if (KindIt == MDKindMap.end())
	      return error("Invalid ID");

	    MDNode *Node = MetadataList.getMDNodeFwdRefOrNull(Record[Pos + 1]);
	    if (!Node)
	      return error("Invalid metadata attachment");

	    GO.addMetadata(KindIt->second, *Node);
	    Pos += 2;
	  }
	  return Error::success();
	}

	/// Parse metadata attachments.
	///
	/// Reads the METADATA_ATTACHMENT_ID sub-block from Stream. A record with an
	/// even number of fields is a function attachment and is applied to F via
	/// parseGlobalObjectAttachment. A record with an odd number of fields is an
	/// instruction attachment: field 0 indexes InstructionList and the remaining
	/// fields are (kind ID, metadata ID) pairs. Returns an error on a malformed
	/// block or record.
	Error MetadataLoader::MetadataLoaderImpl::parseMetadataAttachment(
	    Function &F, const SmallVectorImpl<Instruction *> &InstructionList) {
	  if (Stream.EnterSubBlock(bitc::METADATA_ATTACHMENT_ID))
	    return error("Invalid record");

	  SmallVector<uint64_t, 64> Record;
	  PlaceholderQueue Placeholders;

	  while (true) {
	    BitstreamEntry Entry = Stream.advanceSkippingSubblocks();

	    switch (Entry.Kind) {
	    case BitstreamEntry::SubBlock: // Handled for us already.
	    case BitstreamEntry::Error:
	      return error("Malformed block");
	    case BitstreamEntry::EndBlock:
	      resolveForwardRefsAndPlaceholders(Placeholders);
	      return Error::success();
	    case BitstreamEntry::Record:
	      // The interesting case.
	      break;
	    }

	    // Read a metadata attachment record.
	    Record.clear();
	    ++NumMDRecordLoaded;
	    switch (Stream.readRecord(Entry.ID, Record)) {
	    default: // Default behavior: ignore.
	      break;
	    case bitc::METADATA_ATTACHMENT: {
	      unsigned RecordLength = Record.size();
	      if (Record.empty())
	        return error("Invalid record");
	      if (RecordLength % 2 == 0) {
	        // A function attachment.
	        if (Error Err = parseGlobalObjectAttachment(F, Record))
	          return Err;
	        continue;
	      }

	      // An instruction attachment. The instruction index comes straight from
	      // the bitcode, so reject it if it does not name a known instruction
	      // instead of indexing out of bounds.
	      if (Record[0] >= InstructionList.size())
	        return error("Invalid record");
	      Instruction *Inst = InstructionList[Record[0]];
	      for (unsigned i = 1; i != RecordLength; i = i + 2) {
	        unsigned Kind = Record[i];
	        DenseMap<unsigned, unsigned>::iterator I = MDKindMap.find(Kind);
	        if (I == MDKindMap.end())
	          return error("Invalid ID");
	        if (I->second == LLVMContext::MD_tbaa && StripTBAA)
	          continue;

	        auto Idx = Record[i + 1];
	        if (Idx < (MDStringRef.size() + GlobalMetadataBitPosIndex.size()) &&
	            !MetadataList.lookup(Idx)) {
	          // Load the attachment if it is in the lazy-loadable range and hasn't
	          // been loaded yet.
	          lazyLoadOneMetadata(Idx, Placeholders);
	          resolveForwardRefsAndPlaceholders(Placeholders);
	        }

	        Metadata *Node = MetadataList.getMetadataFwdRef(Idx);
	        if (isa<LocalAsMetadata>(Node))
	          // Drop the attachment. This used to be legal, but there's no
	          // upgrade path.
	          break;
	        MDNode *MD = dyn_cast_or_null<MDNode>(Node);
	        if (!MD)
	          return error("Invalid metadata attachment");

	        if (HasSeenOldLoopTags && I->second == LLVMContext::MD_loop)
	          MD = upgradeInstructionLoopAttachment(*MD);

	        if (I->second == LLVMContext::MD_tbaa) {
	          assert(!MD->isTemporary() && "should load MDs before attachments");
	          MD = UpgradeTBAANode(*MD);
	        }
	        Inst->setMetadata(I->second, MD);
	      }
	      break;
	    }
	    }
	  }
	}

	/// Handle one METADATA_KIND record by registering it in MDKindMap.
	///
	/// Field 0 of Record is the kind ID used in the bitcode; the remaining
	/// fields are the characters of the kind's name. The name is looked up in
	/// TheModule and the bitcode ID is mapped to the resulting ID. Returns an
	/// error if the record is too short or the bitcode ID was already mapped.
	Error MetadataLoader::MetadataLoaderImpl::parseMetadataKindRecord(
	    SmallVectorImpl<uint64_t> &Record) {
	  if (Record.size() < 2)
	    return error("Invalid record");

	  const unsigned BitcodeKind = Record[0];
	  SmallString<8> KindName(Record.begin() + 1, Record.end());
	  const unsigned ModuleKind = TheModule.getMDKindID(KindName.str());

	  const bool Inserted =
	      MDKindMap.insert(std::make_pair(BitcodeKind, ModuleKind)).second;
	  if (Inserted)
	    return Error::success();
	  return error("Conflicting METADATA_KIND records");
	}

	/// Read the METADATA_KIND_BLOCK and register every METADATA_KIND record in it.
	///
	/// Records with any other code are ignored. Returns success when the end of
	/// the block is reached, and an error if the block cannot be entered, is
	/// malformed, or contains a bad METADATA_KIND record.
	Error MetadataLoader::MetadataLoaderImpl::parseMetadataKinds() {
	  if (Stream.EnterSubBlock(bitc::METADATA_KIND_BLOCK_ID))
	    return error("Invalid record");

	  SmallVector<uint64_t, 64> Fields;

	  for (;;) {
	    BitstreamEntry Entry = Stream.advanceSkippingSubblocks();

	    // Sub-blocks are skipped by the cursor, so seeing one here is as
	    // unexpected as an outright error.
	    if (Entry.Kind == BitstreamEntry::SubBlock ||
	        Entry.Kind == BitstreamEntry::Error)
	      return error("Malformed block");
	    if (Entry.Kind == BitstreamEntry::EndBlock)
	      return Error::success();

	    // Anything else is a record: read it.
	    Fields.clear();
	    ++NumMDRecordLoaded;
	    unsigned Code = Stream.readRecord(Entry.ID, Fields);
	    if (Code != bitc::METADATA_KIND)
	      continue; // Default behavior: ignore.

	    if (Error Err = parseMetadataKindRecord(Fields))
	      return Err;
	  }
	}

	/// Move assignment: takes over RHS's implementation object.
	MetadataLoader &MetadataLoader::operator=(MetadataLoader &&RHS) {
	Pimpl = std::move(RHS.Pimpl);
	return *this;
	}
	/// Move constructor: takes over RHS's implementation object.
	MetadataLoader::MetadataLoader(MetadataLoader &&RHS)
	: Pimpl(std::move(RHS.Pimpl)) {}

	// Defaulted destructor, defined in this file alongside the implementation.
	MetadataLoader::~MetadataLoader() = default;
	/// Construct a loader by creating the MetadataLoaderImpl that does the work.
	/// Note that the implementation takes getTypeByID before IsImporting, the
	/// reverse of this constructor's parameter order.
	MetadataLoader::MetadataLoader(BitstreamCursor &Stream, Module &TheModule,
	BitcodeReaderValueList &ValueList,
	bool IsImporting,
	std::function<Type *(unsigned)> getTypeByID)
	: Pimpl(llvm::make_unique<MetadataLoaderImpl>(Stream, TheModule, ValueList,
	getTypeByID, IsImporting)) {}

	/// Forwards to MetadataLoaderImpl::parseMetadata with the same ModuleLevel
	/// flag and returns its result.
	Error MetadataLoader::parseMetadata(bool ModuleLevel) {
	return Pimpl->parseMetadata(ModuleLevel);
	}

	/// Forwards to MetadataLoaderImpl::hasFwdRefs.
	bool MetadataLoader::hasFwdRefs() const { return Pimpl->hasFwdRefs(); }

	/// Return the given metadata, creating a replaceable forward reference if
	/// necessary. Forwards to the implementation object.
	Metadata *MetadataLoader::getMetadataFwdRefOrLoad(unsigned Idx) {
	return Pimpl->getMetadataFwdRefOrLoad(Idx);
	}

	/// Forwards to MetadataLoaderImpl::getMDNodeFwdRefOrNull for index Idx.
	MDNode *MetadataLoader::getMDNodeFwdRefOrNull(unsigned Idx) {
	return Pimpl->getMDNodeFwdRefOrNull(Idx);
	}

	/// Forwards to MetadataLoaderImpl::lookupSubprogramForFunction and returns
	/// whatever subprogram it reports for F.
	DISubprogram *MetadataLoader::lookupSubprogramForFunction(Function *F) {
	  return Pimpl->lookupSubprogramForFunction(F);
	}

	/// Forwards to MetadataLoaderImpl::parseMetadataAttachment with the same
	/// function and instruction list, returning its result.
	Error MetadataLoader::parseMetadataAttachment(
	Function &F, const SmallVectorImpl<Instruction *> &InstructionList) {
	return Pimpl->parseMetadataAttachment(F, InstructionList);
	}

	/// Forwards to MetadataLoaderImpl::parseMetadataKinds and returns its result.
	Error MetadataLoader::parseMetadataKinds() {
	return Pimpl->parseMetadataKinds();
	}

	/// Forwards the StripTBAA setting to the implementation object.
	void MetadataLoader::setStripTBAA(bool StripTBAA) {
	return Pimpl->setStripTBAA(StripTBAA);
	}

	/// Forwards to MetadataLoaderImpl::isStrippingTBAA.
	bool MetadataLoader::isStrippingTBAA() { return Pimpl->isStrippingTBAA(); }

	/// Forwards to MetadataLoaderImpl::size.
	unsigned MetadataLoader::size() const { return Pimpl->size(); }
	/// Forwards to MetadataLoaderImpl::shrinkTo with the same N.
	void MetadataLoader::shrinkTo(unsigned N) { return Pimpl->shrinkTo(N); }
	Index: projects/clang400-import/contrib/llvm/lib/CodeGen/BranchFolding.cpp
	===================================================================
	--- projects/clang400-import/contrib/llvm/lib/CodeGen/BranchFolding.cpp (revision 313642)
	+++ projects/clang400-import/contrib/llvm/lib/CodeGen/BranchFolding.cpp (revision 313643)
	@@ -1,1933 +1,1896 @@
	//===-- BranchFolding.cpp - Fold machine code branch instructions ---------===//
	//
	// The LLVM Compiler Infrastructure
	//
	// This file is distributed under the University of Illinois Open Source
	// License. See LICENSE.TXT for details.
	//
	//===----------------------------------------------------------------------===//
	//
	// This pass forwards branches to unconditional branches to make them branch
	// directly to the target block. This pass often results in dead MBB's, which
	// it then removes.
	//
	// Note that this pass must be run after register allocation, it cannot handle
	// SSA form. It also must handle virtual registers for targets that emit virtual
	// ISA (e.g. NVPTX).
	//
	//===----------------------------------------------------------------------===//

	#include "BranchFolding.h"
	#include "llvm/ADT/STLExtras.h"
	#include "llvm/ADT/SmallSet.h"
	#include "llvm/ADT/Statistic.h"
	#include "llvm/CodeGen/Analysis.h"
	#include "llvm/CodeGen/MachineBlockFrequencyInfo.h"
	#include "llvm/CodeGen/MachineBranchProbabilityInfo.h"
	#include "llvm/CodeGen/MachineFunctionPass.h"
	#include "llvm/CodeGen/MachineJumpTableInfo.h"
	#include "llvm/CodeGen/MachineMemOperand.h"
	#include "llvm/CodeGen/MachineLoopInfo.h"
	#include "llvm/CodeGen/MachineModuleInfo.h"
	#include "llvm/CodeGen/MachineRegisterInfo.h"
	#include "llvm/CodeGen/Passes.h"
	#include "llvm/CodeGen/TargetPassConfig.h"
	#include "llvm/IR/Function.h"
	#include "llvm/Support/CommandLine.h"
	#include "llvm/Support/Debug.h"
	#include "llvm/Support/ErrorHandling.h"
	#include "llvm/Support/raw_ostream.h"
	#include "llvm/Target/TargetInstrInfo.h"
	#include "llvm/Target/TargetRegisterInfo.h"
	#include "llvm/Target/TargetSubtargetInfo.h"
	#include <algorithm>
	using namespace llvm;

	#define DEBUG_TYPE "branchfolding"

	// Statistics for this pass; each description string states what is counted.
	STATISTIC(NumDeadBlocks, "Number of dead blocks removed");
	STATISTIC(NumBranchOpts, "Number of branches optimized");
	STATISTIC(NumTailMerge , "Number of block tails merged");
	STATISTIC(NumHoist , "Number of times common instructions are hoisted");
	-STATISTIC(NumTailCalls, "Number of tail calls optimized");

	// Hidden tri-state option "enable-tail-merge", unset by default. The
	// BranchFolder constructor uses it to override its default tail-merge
	// setting when it is explicitly set to true or false.
	static cl::opt<cl::boolOrDefault> FlagEnableTailMerge("enable-tail-merge",
	cl::init(cl::BOU_UNSET), cl::Hidden);

	// Throttle for huge numbers of predecessors (compile speed problems)
	static cl::opt<unsigned>
	TailMergeThreshold("tail-merge-threshold",
	cl::desc("Max number of predecessors to consider tail merging"),
	cl::init(150), cl::Hidden);

	// Heuristic for tail merging (and, inversely, tail duplication).
	// The BranchFolder constructor falls back to this value when it is given a
	// minimum tail length of 0.
	// TODO: This should be replaced with a target query.
	static cl::opt<unsigned>
	TailMergeSize("tail-merge-size",
	cl::desc("Min number of instructions to consider tail merging"),
	cl::init(3), cl::Hidden);

	namespace {
	/// BranchFolderPass - Wrap branch folder in a machine function pass.
	class BranchFolderPass : public MachineFunctionPass {
	public:
	// Pass identification; its address is exported as llvm::BranchFolderPassID.
	static char ID;
	explicit BranchFolderPass(): MachineFunctionPass(ID) {}

	// Runs the BranchFolder over MF; defined out of line below.
	bool runOnMachineFunction(MachineFunction &MF) override;

	// Declares the analyses runOnMachineFunction fetches with getAnalysis:
	// block frequency, branch probability and the target pass configuration.
	void getAnalysisUsage(AnalysisUsage &AU) const override {
	AU.addRequired<MachineBlockFrequencyInfo>();
	AU.addRequired<MachineBranchProbabilityInfo>();
	AU.addRequired<TargetPassConfig>();
	MachineFunctionPass::getAnalysisUsage(AU);
	}
	};
	}

	// Definition of the pass ID and the public reference to it.
	char BranchFolderPass::ID = 0;
	char &llvm::BranchFolderPassID = BranchFolderPass::ID;

	// Registers the pass under the name "branch-folder".
	INITIALIZE_PASS(BranchFolderPass, "branch-folder",
	"Control Flow Optimizer", false, false)

	/// Run branch folding on MF.
	///
	/// Returns false without doing anything when skipFunction says the function
	/// should be skipped. Otherwise builds a BranchFolder with common-code
	/// hoisting enabled and returns the result of its OptimizeFunction.
	bool BranchFolderPass::runOnMachineFunction(MachineFunction &MF) {
	  if (skipFunction(*MF.getFunction()))
	    return false;

	  TargetPassConfig *PassConfig = &getAnalysis<TargetPassConfig>();
	  // TailMerge can create jump into if branches that make CFG irreducible for
	  // HW that requires structurized CFG.
	  bool EnableTailMerge = !MF.getTarget().requiresStructuredCFG() &&
	                         PassConfig->getEnableTailMerge();
	  BranchFolder::MBFIWrapper MBBFreqInfo(
	      getAnalysis<MachineBlockFrequencyInfo>());
	  BranchFolder Folder(EnableTailMerge, /*CommonHoist=*/true, MBBFreqInfo,
	                      getAnalysis<MachineBranchProbabilityInfo>());
	  return Folder.OptimizeFunction(MF, MF.getSubtarget().getInstrInfo(),
	                                 MF.getSubtarget().getRegisterInfo(),
	                                 getAnalysisIfAvailable<MachineModuleInfo>());
	}

	/// Set up a branch folder.
	///
	/// A MinTailLength of 0 selects the "tail-merge-size" option value as the
	/// minimum common tail length. Tail merging follows defaultEnableTailMerge
	/// unless the "enable-tail-merge" option was explicitly set, in which case
	/// the option wins.
	BranchFolder::BranchFolder(bool defaultEnableTailMerge, bool CommonHoist,
	                           MBFIWrapper &FreqInfo,
	                           const MachineBranchProbabilityInfo &ProbInfo,
	                           unsigned MinTailLength)
	    : EnableHoistCommonCode(CommonHoist), MinCommonTailLength(MinTailLength),
	      MBBFreqInfo(FreqInfo), MBPI(ProbInfo) {
	  if (MinCommonTailLength == 0)
	    MinCommonTailLength = TailMergeSize;

	  const cl::boolOrDefault Requested = FlagEnableTailMerge;
	  if (Requested == cl::BOU_UNSET)
	    EnableTailMerge = defaultEnableTailMerge;
	  else if (Requested == cl::BOU_TRUE)
	    EnableTailMerge = true;
	  else if (Requested == cl::BOU_FALSE)
	    EnableTailMerge = false;
	}

	/// RemoveDeadBlock - Remove the specified dead machine basic block from the
	/// function, updating the CFG.
	///
	/// MBB must have no predecessors (asserted). Besides erasing the block from
	/// its function, this drops it from TriedMerging, FuncletMembership and,
	/// when loop info is available, from MLI.
	void BranchFolder::RemoveDeadBlock(MachineBasicBlock *MBB) {
	assert(MBB->pred_empty() && "MBB must be dead!");
	DEBUG(dbgs() << "\nRemoving MBB: " << *MBB);

	MachineFunction *MF = MBB->getParent();
	// drop all successors.
	while (!MBB->succ_empty())
	MBB->removeSuccessor(MBB->succ_end()-1);

	// Avoid matching if this pointer gets reused.
	TriedMerging.erase(MBB);

	// Remove the block.
	MF->erase(MBB);
	// NOTE(review): MBB has already been erased from MF here; the calls below
	// presumably use the pointer only as a lookup key - confirm.
	FuncletMembership.erase(MBB);
	if (MLI)
	MLI->removeBlock(MBB);
	}

	/// OptimizeFunction - Perform branch folding, tail merging and other
	/// CFG optimizations on the given function. Block placement changes the layout
	/// and may create new tail merging opportunities.
	///
	/// Returns false immediately if tii is null. Otherwise repeats tail merging,
	/// branch optimization and (if enabled) common-code hoisting until an
	/// iteration makes no change, then removes jump tables that are no longer
	/// referenced. Returns true if anything was changed.
	bool BranchFolder::OptimizeFunction(MachineFunction &MF,
	                                    const TargetInstrInfo *tii,
	                                    const TargetRegisterInfo *tri,
	                                    MachineModuleInfo *mmi,
	                                    MachineLoopInfo *mli, bool AfterPlacement) {
	  if (!tii) return false;

	  TriedMerging.clear();

	  AfterBlockPlacement = AfterPlacement;
	  TII = tii;
	  TRI = tri;
	  MMI = mmi;
	  MLI = mli;

	  MachineRegisterInfo &MRI = MF.getRegInfo();
	  UpdateLiveIns = MRI.tracksLiveness() && TRI->trackLivenessAfterRegAlloc(MF);
	  if (!UpdateLiveIns)
	    MRI.invalidateLiveness();

	  // Fix CFG. The later algorithms expect it to be right.
	  bool MadeChange = false;
	  for (MachineBasicBlock &MBB : MF) {
	    MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
	    SmallVector<MachineOperand, 4> Cond;
	    if (!TII->analyzeBranch(MBB, TBB, FBB, Cond, true))
	      MadeChange |= MBB.CorrectExtraCFGEdges(TBB, FBB, !Cond.empty());
	  }

	  // Recalculate funclet membership.
	  FuncletMembership = getFuncletMembership(MF);

	  bool MadeChangeThisIteration = true;
	  while (MadeChangeThisIteration) {
	    MadeChangeThisIteration = TailMergeBlocks(MF);
	    // No need to clean up if tail merging does not change anything after the
	    // block placement.
	    if (!AfterBlockPlacement || MadeChangeThisIteration)
	      MadeChangeThisIteration |= OptimizeBranches(MF);
	    if (EnableHoistCommonCode)
	      MadeChangeThisIteration |= HoistCommonCode(MF);
	    MadeChange |= MadeChangeThisIteration;
	  }

	  // See if any jump tables have become dead as the code generator
	  // did its thing.
	  MachineJumpTableInfo *JTI = MF.getJumpTableInfo();
	  if (!JTI)
	    return MadeChange;

	  // Walk the function to find jump tables that are live.
	  BitVector JTIsLive(JTI->getJumpTables().size());
	  for (const MachineBasicBlock &BB : MF) {
	    for (const MachineInstr &I : BB)
	      for (const MachineOperand &Op : I.operands()) {
	        if (!Op.isJTI()) continue;

	        // Remember that this JT is live.
	        JTIsLive.set(Op.getIndex());
	      }
	  }

	  // Finally, remove dead jump tables. This happens when the
	  // indirect jump was unreachable (and thus deleted).
	  for (unsigned i = 0, e = JTIsLive.size(); i != e; ++i)
	    if (!JTIsLive.test(i)) {
	      JTI->RemoveJumpTable(i);
	      MadeChange = true;
	    }

	  return MadeChange;
	}

	//===----------------------------------------------------------------------===//
	// Tail Merging of Blocks
	//===----------------------------------------------------------------------===//

	/// HashMachineInstr - Compute a hash value for MI and its operands.
	///
	/// The hash starts from the opcode and mixes in, per operand, a value taken
	/// from the operand (register, immediate, block number, index or offset,
	/// depending on its type; 0 for other types) together with the operand type.
	static unsigned HashMachineInstr(const MachineInstr &MI) {
	  unsigned Hash = MI.getOpcode();
	  for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
	    const MachineOperand &Op = MI.getOperand(i);

	    // Merge in bits from the operand if easy. We can't use MachineOperand's
	    // hash_code here because it's not deterministic and we sort by hash value
	    // later.
	    unsigned OperandHash = 0;
	    switch (Op.getType()) {
	    case MachineOperand::MO_Register:
	      OperandHash = Op.getReg();
	      break;
	    case MachineOperand::MO_Immediate:
	      OperandHash = Op.getImm();
	      break;
	    case MachineOperand::MO_MachineBasicBlock:
	      OperandHash = Op.getMBB()->getNumber();
	      break;
	    case MachineOperand::MO_FrameIndex:
	    case MachineOperand::MO_ConstantPoolIndex:
	    case MachineOperand::MO_JumpTableIndex:
	      OperandHash = Op.getIndex();
	      break;
	    case MachineOperand::MO_GlobalAddress:
	    case MachineOperand::MO_ExternalSymbol:
	      // Global address / external symbol are too hard, don't bother, but do
	      // pull in the offset.
	      OperandHash = Op.getOffset();
	      break;
	    default:
	      break;
	    }

	    // Shift by the operand position (mod 32) so that operand order matters.
	    Hash += ((OperandHash << 3) | Op.getType()) << (i & 31);
	  }
	  return Hash;
	}

	/// HashEndOfMBB - Hash the last instruction in the MBB.
	static unsigned HashEndOfMBB(const MachineBasicBlock &MBB) {
	MachineBasicBlock::const_iterator I = MBB.getLastNonDebugInstr();
	if (I == MBB.end())
	return 0;

	return HashMachineInstr(*I);
	}

	/// ComputeCommonTailLength - Given two machine basic blocks, compute the number
	/// of instructions they actually have in common together at their end. Return
	/// iterators for the first shared instruction in each block.
	///
	/// Debug-value instructions are skipped while matching and are not counted.
	/// Matching stops at the first pair of instructions that are not identical,
	/// or at an inline asm instruction. I1 and I2 are output parameters.
	static unsigned ComputeCommonTailLength(MachineBasicBlock *MBB1,
	                                        MachineBasicBlock *MBB2,
	                                        MachineBasicBlock::iterator &I1,
	                                        MachineBasicBlock::iterator &I2) {
	  I1 = MBB1->end();
	  I2 = MBB2->end();

	  unsigned TailLen = 0;
	  while (I1 != MBB1->begin() && I2 != MBB2->begin()) {
	    --I1; --I2;
	    // Skip debugging pseudos; necessary to avoid changing the code.
	    while (I1->isDebugValue()) {
	      if (I1==MBB1->begin()) {
	        while (I2->isDebugValue()) {
	          if (I2==MBB2->begin())
	            // I1==DBG at begin; I2==DBG at begin
	            return TailLen;
	          --I2;
	        }
	        ++I2;
	        // I1==DBG at begin; I2==non-DBG, or first of DBGs not at begin
	        return TailLen;
	      }
	      --I1;
	    }
	    // I1==first (untested) non-DBG preceding known match
	    while (I2->isDebugValue()) {
	      if (I2==MBB2->begin()) {
	        ++I1;
	        // I1==non-DBG, or first of DBGs not at begin; I2==DBG at begin
	        return TailLen;
	      }
	      --I2;
	    }
	    // I1, I2==first (untested) non-DBGs preceding known match
	    if (!I1->isIdenticalTo(*I2) ||
	        // FIXME: This check is dubious. It's used to get around a problem where
	        // people incorrectly expect inline asm directives to remain in the same
	        // relative order. This is untenable because normal compiler
	        // optimizations (like this one) may reorder and/or merge these
	        // directives.
	        I1->isInlineAsm()) {
	      ++I1; ++I2;
	      break;
	    }
	    ++TailLen;
	  }
	  // Back past possible debugging pseudos at beginning of block. This matters
	  // when one block differs from the other only by whether debugging pseudos
	  // are present at the beginning. (This way, the various checks later for
	  // I1==MBB1->begin() work as expected.)
	  if (I1 == MBB1->begin() && I2 != MBB2->begin()) {
	    --I2;
	    while (I2->isDebugValue()) {
	      if (I2 == MBB2->begin())
	        return TailLen;
	      --I2;
	    }
	    ++I2;
	  }
	  if (I2 == MBB2->begin() && I1 != MBB1->begin()) {
	    --I1;
	    while (I1->isDebugValue()) {
	      if (I1 == MBB1->begin())
	        return TailLen;
	      --I1;
	    }
	    ++I1;
	  }
	  return TailLen;
	}

	/// ReplaceTailWithBranchTo - Delete the instruction OldInst and everything
	/// after it, replacing it with an unconditional branch to NewDest. The
	/// rewrite itself is delegated to TargetInstrInfo; this wrapper additionally
	/// bumps the tail-merge counter and, when live-in tracking is enabled,
	/// rebuilds NewDest's live-in list from scratch.
	void BranchFolder::ReplaceTailWithBranchTo(MachineBasicBlock::iterator OldInst,
	                                           MachineBasicBlock *NewDest) {
	  TII->ReplaceTailWithBranchTo(OldInst, NewDest);
	  ++NumTailMerge;

	  if (!UpdateLiveIns)
	    return;

	  // Discard the stale live-ins before recomputing them.
	  NewDest->clearLiveIns();
	  computeLiveIns(LiveRegs, TRI, NewDest);
	}

	/// SplitMBBAt - Given a machine basic block and an iterator into it, split the
	/// MBB so that the part before the iterator falls into the part starting at the
	/// iterator. This returns the new MBB, or nullptr when the target reports that
	/// splitting at this position is not legal.
	///
	/// \param CurMBB Block to split; keeps the instructions before BBI1.
	/// \param BBI1   First instruction that moves into the new block.
	/// \param BB     IR basic block handed to CreateMachineBasicBlock.
	MachineBasicBlock *BranchFolder::SplitMBBAt(MachineBasicBlock &CurMBB,
	                                            MachineBasicBlock::iterator BBI1,
	                                            const BasicBlock *BB) {
	  if (!TII->isLegalToSplitMBBAt(CurMBB, BBI1))
	    return nullptr;

	  MachineFunction &MF = *CurMBB.getParent();

	  // Create the fall-through block directly after CurMBB in layout order.
	  MachineFunction::iterator InsertPos = std::next(CurMBB.getIterator());
	  MachineBasicBlock *TailMBB = MF.CreateMachineBasicBlock(BB);
	  MF.insert(InsertPos, TailMBB);

	  // The new block takes over every successor of CurMBB ...
	  TailMBB->transferSuccessors(&CurMBB);

	  // ... and becomes CurMBB's only successor (the fall-through edge).
	  CurMBB.addSuccessor(TailMBB);

	  // Move [BBI1, end) of CurMBB to the end of the new block.
	  TailMBB->splice(TailMBB->end(), &CurMBB, BBI1, CurMBB.end());

	  // The new block belongs to the same loop as CurMBB.
	  MachineLoop *EnclosingLoop = MLI ? MLI->getLoopFor(&CurMBB) : nullptr;
	  if (EnclosingLoop)
	    EnclosingLoop->addBasicBlockToLoop(TailMBB, MLI->getBase());

	  // The new block inherits CurMBB's block frequency.
	  MBBFreqInfo.setBlockFreq(TailMBB, MBBFreqInfo.getBlockFreq(&CurMBB));

	  if (UpdateLiveIns)
	    computeLiveIns(LiveRegs, TRI, TailMBB);

	  // Register the new block in CurMBB's funclet, if CurMBB has one.
	  const auto FuncletIt = FuncletMembership.find(&CurMBB);
	  if (FuncletIt != FuncletMembership.end()) {
	    // Copy the id out before operator[] touches the map.
	    const auto FuncletId = FuncletIt->second;
	    FuncletMembership[TailMBB] = FuncletId;
	  }

	  return TailMBB;
	}

	/// EstimateRuntime - Make a rough estimate for how long it will take to run
	/// the specified code.
	///
	/// Each instruction in [I, E) is charged a fixed cost: 10 for a call, 2 for
	/// anything that may load or store, 1 otherwise. Debug values cost nothing.
	///
	/// \returns The summed cost, in arbitrary units.
	static unsigned EstimateRuntime(MachineBasicBlock::iterator I,
	                                MachineBasicBlock::iterator E) {
	  unsigned Time = 0;
	  for (; I != E; ++I) {
	    if (I->isDebugValue())
	      continue;
	    if (I->isCall())
	      Time += 10;
	    else if (I->mayLoad() || I->mayStore())
	      Time += 2;
	    else
	      ++Time;
	  }
	  return Time;
	}

	// CurMBB needs to add an unconditional branch to SuccMBB (we removed these
	// branches temporarily for tail merging). In the case where CurMBB ends
	// with a conditional branch to the next block, optimize by reversing the
	// test and conditionally branching to SuccMBB instead.
	static void FixTail(MachineBasicBlock CurMBB, MachineBasicBlock SuccBB,
	const TargetInstrInfo *TII) {
	MachineFunction *MF = CurMBB->getParent();
	MachineFunction::iterator I = std::next(MachineFunction::iterator(CurMBB));
	MachineBasicBlock TBB = nullptr, FBB = nullptr;
	SmallVector<MachineOperand, 4> Cond;
	DebugLoc dl; // FIXME: this is nowhere
	if (I != MF->end() && !TII->analyzeBranch(*CurMBB, TBB, FBB, Cond, true)) {
	MachineBasicBlock NextBB = &I;
	if (TBB == NextBB && !Cond.empty() && !FBB) {
	if (!TII->reverseBranchCondition(Cond)) {
	TII->removeBranch(*CurMBB);
	TII->insertBranch(*CurMBB, SuccBB, nullptr, Cond, dl);
	return;
	}
	}
	}
	TII->insertBranch(*CurMBB, SuccBB, nullptr,
	SmallVector<MachineOperand, 0>(), dl);
	}

	// Order merge candidates by hash first and by block number second, so that
	// sorting places blocks with the same tail hash next to each other.
	bool
	BranchFolder::MergePotentialsElt::operator<(const MergePotentialsElt &o) const {
	  if (getHash() != o.getHash())
	    return getHash() < o.getHash();

	  const auto ThisNumber = getBlock()->getNumber();
	  const auto OtherNumber = o.getBlock()->getNumber();
	  if (ThisNumber != OtherNumber)
	    return ThisNumber < OtherNumber;

	  // _GLIBCXX_DEBUG checks strict weak ordering, which involves comparing
	  // an object with itself.
	#ifndef _GLIBCXX_DEBUG
	  llvm_unreachable("Predecessor appears twice");
	#else
	  return false;
	#endif
	}

	// Return the frequency recorded for MBB through setBlockFreq() if there is
	// one; otherwise defer to the wrapped MBFI analysis.
	BlockFrequency
	BranchFolder::MBFIWrapper::getBlockFreq(const MachineBasicBlock *MBB) const {
	  const auto Override = MergedBBFreq.find(MBB);
	  if (Override == MergedBBFreq.end())
	    return MBFI.getBlockFreq(MBB);

	  return Override->second;
	}

	void BranchFolder::MBFIWrapper::setBlockFreq(const MachineBasicBlock *MBB,
	BlockFrequency F) {
	MergedBBFreq[MBB] = F;
	}

	// Print the frequency of MBB, as reported by getBlockFreq(), to OS and
	// return the stream handed back by the wrapped analysis.
	raw_ostream &
	BranchFolder::MBFIWrapper::printBlockFreq(raw_ostream &OS,
	                                          const MachineBasicBlock *MBB) const {
	  const BlockFrequency Freq = getBlockFreq(MBB);
	  return MBFI.printBlockFreq(OS, Freq);
	}

	// Print an explicit frequency value to OS by forwarding to the wrapped
	// analysis, and return the stream it hands back.
	raw_ostream &
	BranchFolder::MBFIWrapper::printBlockFreq(raw_ostream &OS,
	                                          const BlockFrequency Freq) const {
	  raw_ostream &Result = MBFI.printBlockFreq(OS, Freq);
	  return Result;
	}

	/// CountTerminators - Count the terminators at the end of the given block.
	/// On return, I refers to the instruction immediately preceding that run of
	/// terminators, or is MBB->end() when the backwards scan reaches the start of
	/// the block without meeting a non-terminator.
	static unsigned CountTerminators(MachineBasicBlock *MBB,
	                                 MachineBasicBlock::iterator &I) {
	  unsigned NumTerms = 0;
	  I = MBB->end();
	  while (I != MBB->begin()) {
	    --I;
	    // Stop on the first non-terminator seen from the back.
	    if (!I->isTerminator())
	      return NumTerms;
	    ++NumTerms;
	  }
	  // Every instruction (if any) was a terminator.
	  I = MBB->end();
	  return NumTerms;
	}

	/// ProfitableToMerge - Check if two machine basic blocks have a common tail
	/// and decide if it would be profitable to merge those tails. Return the
	/// length of the common tail and iterators to the first common instruction
	/// in each block.
	/// MBB1, MBB2 The blocks to check
	/// MinCommonTailLength Minimum size of tail block to be merged.
	/// CommonTailLen Out parameter to record the size of the shared tail between
	/// MBB1 and MBB2
	/// I1, I2 Iterator references that will be changed to point to the first
	/// instruction in the common tail shared by MBB1,MBB2
	/// SuccBB A common successor of MBB1, MBB2 which are in a canonical form
	/// relative to SuccBB
	/// PredBB The layout predecessor of SuccBB, if any.
	/// FuncletMembership map from block to funclet #.
	/// AfterPlacement True if we are merging blocks after layout. Stricter
	/// thresholds apply to prevent undoing tail-duplication.
	static bool
	ProfitableToMerge(MachineBasicBlock MBB1, MachineBasicBlock MBB2,
	unsigned MinCommonTailLength, unsigned &CommonTailLen,
	MachineBasicBlock::iterator &I1,
	MachineBasicBlock::iterator &I2, MachineBasicBlock *SuccBB,
	MachineBasicBlock *PredBB,
	DenseMap<const MachineBasicBlock *, int> &FuncletMembership,
	bool AfterPlacement) {
	// It is never profitable to tail-merge blocks from two different funclets.
	if (!FuncletMembership.empty()) {
	auto Funclet1 = FuncletMembership.find(MBB1);
	assert(Funclet1 != FuncletMembership.end());
	auto Funclet2 = FuncletMembership.find(MBB2);
	assert(Funclet2 != FuncletMembership.end());
	if (Funclet1->second != Funclet2->second)
	return false;
	}

	CommonTailLen = ComputeCommonTailLength(MBB1, MBB2, I1, I2);
	if (CommonTailLen == 0)
	return false;
	DEBUG(dbgs() << "Common tail length of BB#" << MBB1->getNumber()
	<< " and BB#" << MBB2->getNumber() << " is " << CommonTailLen
	<< '\n');

	// It's almost always profitable to merge any number of non-terminator
	// instructions with the block that falls through into the common successor.
	// This is true only for a single successor. For multiple successors, we are
	// trading a conditional branch for an unconditional one.
	// TODO: Re-visit successor size for non-layout tail merging.
	if ((MBB1 == PredBB \|\| MBB2 == PredBB) &&
	(!AfterPlacement \|\| MBB1->succ_size() == 1)) {
	MachineBasicBlock::iterator I;
	unsigned NumTerms = CountTerminators(MBB1 == PredBB ? MBB2 : MBB1, I);
	if (CommonTailLen > NumTerms)
	return true;
	}

	// If one of the blocks can be completely merged and happens to be in
	// a position where the other could fall through into it, merge any number
	// of instructions, because it can be done without a branch.
	// TODO: If the blocks are not adjacent, move one of them so that they are?
	if (MBB1->isLayoutSuccessor(MBB2) && I2 == MBB2->begin())
	return true;
	if (MBB2->isLayoutSuccessor(MBB1) && I1 == MBB1->begin())
	return true;

	// If both blocks have an unconditional branch temporarily stripped out,
	// count that as an additional common instruction for the following
	// heuristics. This heuristic is only accurate for single-succ blocks, so to
	// make sure that during layout merging and duplicating don't crash, we check
	// for that when merging during layout.
	unsigned EffectiveTailLen = CommonTailLen;
	if (SuccBB && MBB1 != PredBB && MBB2 != PredBB &&
	(MBB1->succ_size() == 1 \|\| !AfterPlacement) &&
	!MBB1->back().isBarrier() &&
	!MBB2->back().isBarrier())
	++EffectiveTailLen;

	// Check if the common tail is long enough to be worthwhile.
	if (EffectiveTailLen >= MinCommonTailLength)
	return true;

	// If we are optimizing for code size, 2 instructions in common is enough if
	// we don't have to split a block. At worst we will be introducing 1 new
	// branch instruction, which is likely to be smaller than the 2
	// instructions that would be deleted in the merge.
	MachineFunction *MF = MBB1->getParent();
	return EffectiveTailLen >= 2 && MF->getFunction()->optForSize() &&
	(I1 == MBB1->begin() \|\| I2 == MBB2->begin());
	}

	/// ComputeSameTails - Look through all the blocks in MergePotentials that have
	/// hash CurHash (guaranteed to match the last element). Build the vector
	/// SameTails of all those that have the (same) largest number of instructions
	/// in common of any pair of these blocks. SameTails entries contain an
	/// iterator into MergePotentials (from which the MachineBasicBlock can be
	/// found) and a MachineBasicBlock::iterator into that MBB indicating the
	/// instruction where the matching code sequence begins.
	/// Order of elements in SameTails is the reverse of the order in which
	/// those blocks appear in MergePotentials (where they are not necessarily
	/// consecutive).
	unsigned BranchFolder::ComputeSameTails(unsigned CurHash,
	unsigned MinCommonTailLength,
	MachineBasicBlock *SuccBB,
	MachineBasicBlock *PredBB) {
	unsigned maxCommonTailLength = 0U;
	SameTails.clear();
	MachineBasicBlock::iterator TrialBBI1, TrialBBI2;
	MPIterator HighestMPIter = std::prev(MergePotentials.end());
	for (MPIterator CurMPIter = std::prev(MergePotentials.end()),
	B = MergePotentials.begin();
	CurMPIter != B && CurMPIter->getHash() == CurHash; --CurMPIter) {
	for (MPIterator I = std::prev(CurMPIter); I->getHash() == CurHash; --I) {
	unsigned CommonTailLen;
	if (ProfitableToMerge(CurMPIter->getBlock(), I->getBlock(),
	MinCommonTailLength,
	CommonTailLen, TrialBBI1, TrialBBI2,
	SuccBB, PredBB,
	FuncletMembership,
	AfterBlockPlacement)) {
	if (CommonTailLen > maxCommonTailLength) {
	SameTails.clear();
	maxCommonTailLength = CommonTailLen;
	HighestMPIter = CurMPIter;
	SameTails.push_back(SameTailElt(CurMPIter, TrialBBI1));
	}
	if (HighestMPIter == CurMPIter &&
	CommonTailLen == maxCommonTailLength)
	SameTails.push_back(SameTailElt(I, TrialBBI2));
	}
	if (I == B)
	break;
	}
	}
	return maxCommonTailLength;
	}

	/// RemoveBlocksWithHash - Remove all blocks with hash CurHash from
	/// MergePotentials, restoring branches at ends of blocks as appropriate.
	void BranchFolder::RemoveBlocksWithHash(unsigned CurHash,
	MachineBasicBlock *SuccBB,
	MachineBasicBlock *PredBB) {
	MPIterator CurMPIter, B;
	for (CurMPIter = std::prev(MergePotentials.end()),
	B = MergePotentials.begin();
	CurMPIter->getHash() == CurHash; --CurMPIter) {
	// Put the unconditional branch back, if we need one.
	MachineBasicBlock *CurMBB = CurMPIter->getBlock();
	if (SuccBB && CurMBB != PredBB)
	FixTail(CurMBB, SuccBB, TII);
	if (CurMPIter == B)
	break;
	}
	if (CurMPIter->getHash() != CurHash)
	CurMPIter++;
	MergePotentials.erase(CurMPIter, MergePotentials.end());
	}

	/// CreateCommonTailOnlyBlock - None of the blocks to be tail-merged consist
	/// only of the common tail. Create a block that does by splitting one.
	///
	/// \param PredBB  In/out: replaced by the new block if PredBB was the one
	///                that got split.
	/// \param SuccBB  Common successor of the candidates, or null.
	/// \param maxCommonTailLength Length of the common tail; only used for the
	///                debug output here.
	/// \param commonTailIndex Out: index into SameTails of the entry that was
	///                chosen for splitting.
	/// \returns false if SplitMBBAt refused to split the chosen block.
	bool BranchFolder::CreateCommonTailOnlyBlock(MachineBasicBlock *&PredBB,
	                                             MachineBasicBlock *SuccBB,
	                                             unsigned maxCommonTailLength,
	                                             unsigned &commonTailIndex) {
	  commonTailIndex = 0;
	  unsigned TimeEstimate = ~0U;
	  for (unsigned i = 0, e = SameTails.size(); i != e; ++i) {
	    // Use PredBB if possible; that doesn't require a new branch.
	    if (SameTails[i].getBlock() == PredBB) {
	      commonTailIndex = i;
	      break;
	    }
	    // Otherwise, make a (fairly bogus) choice based on estimate of
	    // how long it will take the various blocks to execute.
	    unsigned t = EstimateRuntime(SameTails[i].getBlock()->begin(),
	                                 SameTails[i].getTailStartPos());
	    // "<=" means the later entry wins when two estimates tie.
	    if (t <= TimeEstimate) {
	      TimeEstimate = t;
	      commonTailIndex = i;
	    }
	  }

	  MachineBasicBlock::iterator BBI =
	      SameTails[commonTailIndex].getTailStartPos();
	  MachineBasicBlock *MBB = SameTails[commonTailIndex].getBlock();

	  DEBUG(dbgs() << "\nSplitting BB#" << MBB->getNumber() << ", size "
	               << maxCommonTailLength);

	  // If the split block unconditionally falls-thru to SuccBB, it will be
	  // merged. In control flow terms it should then take SuccBB's name. e.g. If
	  // SuccBB is an inner loop, the common tail is still part of the inner loop.
	  const BasicBlock *BB = (SuccBB && MBB->succ_size() == 1) ?
	      SuccBB->getBasicBlock() : MBB->getBasicBlock();
	  MachineBasicBlock *newMBB = SplitMBBAt(*MBB, BBI, BB);
	  if (!newMBB) {
	    DEBUG(dbgs() << "... failed!");
	    return false;
	  }

	  // The chosen entry now refers to the freshly created tail-only block.
	  SameTails[commonTailIndex].setBlock(newMBB);
	  SameTails[commonTailIndex].setTailStartPos(newMBB->begin());

	  // If we split PredBB, newMBB is the new predecessor.
	  if (PredBB == MBB)
	    PredBB = newMBB;

	  return true;
	}

	// Fold per-instruction information from a tail that is about to be deleted
	// into the matching instructions of the surviving common block: memory
	// operands are merged, and an undef flag is kept only if the other copy has
	// it as well.
	//
	// MBBIStartPos - first instruction of the tail in the block being merged.
	// MBBCommon    - the block that keeps the common tail.
	static void
	mergeOperations(MachineBasicBlock::iterator MBBIStartPos,
	                MachineBasicBlock &MBBCommon) {
	  MachineBasicBlock *MBB = MBBIStartPos->getParent();
	  // Note CommonTailLen does not necessarily matches the size of
	  // the common BB nor all its instructions because of debug
	  // instructions differences.
	  unsigned CommonTailLen = 0;
	  for (auto E = MBB->end(); MBBIStartPos != E; ++MBBIStartPos)
	    ++CommonTailLen;

	  // Walk both blocks from the back.
	  MachineBasicBlock::reverse_iterator MBBI = MBB->rbegin();
	  MachineBasicBlock::reverse_iterator MBBIE = MBB->rend();
	  MachineBasicBlock::reverse_iterator MBBICommon = MBBCommon.rbegin();
	  MachineBasicBlock::reverse_iterator MBBIECommon = MBBCommon.rend();

	  while (CommonTailLen--) {
	    assert(MBBI != MBBIE && "Reached BB end within common tail length!");
	    (void)MBBIE;

	    // Debug values in the merged block have no counterpart to update.
	    if (MBBI->isDebugValue()) {
	      ++MBBI;
	      continue;
	    }

	    // Likewise skip debug values in the common block.
	    while ((MBBICommon != MBBIECommon) && MBBICommon->isDebugValue())
	      ++MBBICommon;

	    assert(MBBICommon != MBBIECommon &&
	           "Reached BB end within common tail length!");
	    assert(MBBICommon->isIdenticalTo(*MBBI) && "Expected matching MIIs!");

	    // Merge MMOs from memory operations in the common block.
	    if (MBBICommon->mayLoad() || MBBICommon->mayStore())
	      MBBICommon->setMemRefs(MBBICommon->mergeMemRefsWith(*MBBI));
	    // Drop undef flags if they aren't present in all merged instructions.
	    for (unsigned I = 0, E = MBBICommon->getNumOperands(); I != E; ++I) {
	      MachineOperand &MO = MBBICommon->getOperand(I);
	      if (MO.isReg() && MO.isUndef()) {
	        const MachineOperand &OtherMO = MBBI->getOperand(I);
	        if (!OtherMO.isUndef())
	          MO.setIsUndef(false);
	      }
	    }

	    ++MBBI;
	    ++MBBICommon;
	  }
	}

	// See if any of the blocks in MergePotentials (which all have SuccBB as a
	// successor, or all have no successor if it is null) can be tail-merged.
	// If there is a successor, any blocks in MergePotentials that are not
	// tail-merged and are not immediately before Succ must have an unconditional
	// branch to Succ added (but the predecessor/successor lists need no
	// adjustment). The lone predecessor of Succ that falls through into Succ,
	// if any, is given in PredBB.
	// MinCommonTailLength - Except for the special cases below, tail-merge if
	// there are at least this many instructions in common.
	// Returns true if at least one group of blocks was merged.
	bool BranchFolder::TryTailMergeBlocks(MachineBasicBlock *SuccBB,
	                                      MachineBasicBlock *PredBB,
	                                      unsigned MinCommonTailLength) {
	  bool MadeChange = false;

	  DEBUG(dbgs() << "\nTryTailMergeBlocks: ";
	        for (unsigned i = 0, e = MergePotentials.size(); i != e; ++i)
	          dbgs() << "BB#" << MergePotentials[i].getBlock()->getNumber()
	                 << (i == e-1 ? "" : ", ");
	        dbgs() << "\n";
	        if (SuccBB) {
	          dbgs() << " with successor BB#" << SuccBB->getNumber() << '\n';
	          if (PredBB)
	            dbgs() << " which has fall-through from BB#"
	                   << PredBB->getNumber() << "\n";
	        }
	        dbgs() << "Looking for common tails of at least "
	               << MinCommonTailLength << " instruction"
	               << (MinCommonTailLength == 1 ? "" : "s") << '\n';
	       );

	  // Sort by hash value so that blocks with identical end sequences sort
	  // together.
	  array_pod_sort(MergePotentials.begin(), MergePotentials.end());

	  // Walk through equivalence sets looking for actual exact matches.
	  while (MergePotentials.size() > 1) {
	    unsigned CurHash = MergePotentials.back().getHash();

	    // Build SameTails, identifying the set of blocks with this hash code
	    // and with the maximum number of instructions in common.
	    unsigned maxCommonTailLength = ComputeSameTails(CurHash,
	                                                    MinCommonTailLength,
	                                                    SuccBB, PredBB);

	    // If we didn't find any pair that has at least MinCommonTailLength
	    // instructions in common, remove all blocks with this hash code and retry.
	    if (SameTails.empty()) {
	      RemoveBlocksWithHash(CurHash, SuccBB, PredBB);
	      continue;
	    }

	    // If one of the blocks is the entire common tail (and not the entry
	    // block, which we can't jump to), we can treat all blocks with this same
	    // tail at once. Use PredBB if that is one of the possibilities, as that
	    // will not introduce any extra branches.
	    MachineBasicBlock *EntryBB =
	        &MergePotentials.front().getBlock()->getParent()->front();
	    // SameTails.size() serves as the "nothing chosen yet" sentinel.
	    unsigned commonTailIndex = SameTails.size();
	    // If there are two blocks, check to see if one can be made to fall through
	    // into the other.
	    if (SameTails.size() == 2 &&
	        SameTails[0].getBlock()->isLayoutSuccessor(SameTails[1].getBlock()) &&
	        SameTails[1].tailIsWholeBlock())
	      commonTailIndex = 1;
	    else if (SameTails.size() == 2 &&
	             SameTails[1].getBlock()->isLayoutSuccessor(
	                 SameTails[0].getBlock()) &&
	             SameTails[0].tailIsWholeBlock())
	      commonTailIndex = 0;
	    else {
	      // Otherwise just pick one, favoring the fall-through predecessor if
	      // there is one.
	      for (unsigned i = 0, e = SameTails.size(); i != e; ++i) {
	        MachineBasicBlock *MBB = SameTails[i].getBlock();
	        if (MBB == EntryBB && SameTails[i].tailIsWholeBlock())
	          continue;
	        if (MBB == PredBB) {
	          commonTailIndex = i;
	          break;
	        }
	        if (SameTails[i].tailIsWholeBlock())
	          commonTailIndex = i;
	      }
	    }

	    if (commonTailIndex == SameTails.size() ||
	        (SameTails[commonTailIndex].getBlock() == PredBB &&
	         !SameTails[commonTailIndex].tailIsWholeBlock())) {
	      // None of the blocks consist entirely of the common tail.
	      // Split a block so that one does.
	      if (!CreateCommonTailOnlyBlock(PredBB, SuccBB,
	                                     maxCommonTailLength, commonTailIndex)) {
	        RemoveBlocksWithHash(CurHash, SuccBB, PredBB);
	        continue;
	      }
	    }

	    MachineBasicBlock *MBB = SameTails[commonTailIndex].getBlock();

	    // Recompute common tail MBB's edge weights and block frequency.
	    setCommonTailEdgeWeights(*MBB);

	    // Remove the original debug location from the common tail.
	    for (auto &MI : *MBB)
	      if (!MI.isDebugValue())
	        MI.setDebugLoc(DebugLoc());

	    // MBB is common tail. Adjust all other BB's to jump to this one.
	    // Traversal must be forwards so erases work.
	    DEBUG(dbgs() << "\nUsing common tail in BB#" << MBB->getNumber()
	                 << " for ");
	    for (unsigned int i=0, e = SameTails.size(); i != e; ++i) {
	      if (commonTailIndex == i)
	        continue;
	      DEBUG(dbgs() << "BB#" << SameTails[i].getBlock()->getNumber()
	                   << (i == e-1 ? "" : ", "));
	      // Merge operations (MMOs, undef flags)
	      mergeOperations(SameTails[i].getTailStartPos(), *MBB);
	      // Hack the end off BB i, making it jump to BB commonTailIndex instead.
	      ReplaceTailWithBranchTo(SameTails[i].getTailStartPos(), MBB);
	      // BB i is no longer a predecessor of SuccBB; remove it from the worklist.
	      MergePotentials.erase(SameTails[i].getMPIter());
	    }
	    DEBUG(dbgs() << "\n");
	    // We leave commonTailIndex in the worklist in case there are other blocks
	    // that match it with a smaller number of instructions.
	    MadeChange = true;
	  }
	  return MadeChange;
	}

	// Drive tail merging over the whole function. Blocks without successors are
	// tried first (only before block placement), then, for every block with at
	// least two predecessors, those predecessors are tried as one group.
	// Returns true if any merge was performed; does nothing and returns false
	// when EnableTailMerge is off.
	bool BranchFolder::TailMergeBlocks(MachineFunction &MF) {
	  bool MadeChange = false;
	  if (!EnableTailMerge) return MadeChange;

	  // First find blocks with no successors.
	  // Block placement does not create new tail merging opportunities for these
	  // blocks.
	  if (!AfterBlockPlacement) {
	    MergePotentials.clear();
	    for (MachineBasicBlock &MBB : MF) {
	      if (MergePotentials.size() == TailMergeThreshold)
	        break;
	      if (!TriedMerging.count(&MBB) && MBB.succ_empty())
	        MergePotentials.push_back(MergePotentialsElt(HashEndOfMBB(MBB), &MBB));
	    }

	    // If this is a large problem, avoid visiting the same basic blocks
	    // multiple times.
	    if (MergePotentials.size() == TailMergeThreshold)
	      for (unsigned i = 0, e = MergePotentials.size(); i != e; ++i)
	        TriedMerging.insert(MergePotentials[i].getBlock());

	    // See if we can do any tail merging on those.
	    if (MergePotentials.size() >= 2)
	      MadeChange |= TryTailMergeBlocks(nullptr, nullptr, MinCommonTailLength);
	  }

	  // Look at blocks (IBB) with multiple predecessors (PBB).
	  // We change each predecessor to a canonical form, by
	  // (1) temporarily removing any unconditional branch from the predecessor
	  // to IBB, and
	  // (2) alter conditional branches so they branch to the other block
	  // not IBB; this may require adding back an unconditional branch to IBB
	  // later, where there wasn't one coming in. E.g.
	  //   Bcc IBB
	  //   fallthrough to QBB
	  // here becomes
	  //   Bncc QBB
	  // with a conceptual B to IBB after that, which never actually exists.
	  // With those changes, we see whether the predecessors' tails match,
	  // and merge them if so. We change things out of canonical form and
	  // back to the way they were later in the process. (OptimizeBranches
	  // would undo some of this, but we can't use it, because we'd get into
	  // a compile-time infinite loop repeatedly doing and undoing the same
	  // transformations.)

	  for (MachineFunction::iterator I = std::next(MF.begin()), E = MF.end();
	       I != E; ++I) {
	    if (I->pred_size() < 2) continue;
	    SmallPtrSet<MachineBasicBlock *, 8> UniquePreds;
	    MachineBasicBlock *IBB = &*I;
	    MachineBasicBlock *PredBB = &*std::prev(I);
	    MergePotentials.clear();
	    // Only assigned, and only read, when AfterBlockPlacement && MLI holds.
	    MachineLoop *ML;

	    // Bail if merging after placement and IBB is the loop header because
	    // -- If merging predecessors that belong to the same loop as IBB, the
	    // common tail of merged predecessors may become the loop top if block
	    // placement is called again and the predecessors may branch to this common
	    // tail and require more branches. This can be relaxed if
	    // MachineBlockPlacement::findBestLoopTop is more flexible.
	    // --If merging predecessors that do not belong to the same loop as IBB, the
	    // loop info of IBB's loop and the other loops may be affected. Calling the
	    // block placement again may make big change to the layout and eliminate the
	    // reason to do tail merging here.
	    if (AfterBlockPlacement && MLI) {
	      ML = MLI->getLoopFor(IBB);
	      if (ML && IBB == ML->getHeader())
	        continue;
	    }

	    for (MachineBasicBlock *PBB : I->predecessors()) {
	      if (MergePotentials.size() == TailMergeThreshold)
	        break;

	      if (TriedMerging.count(PBB))
	        continue;

	      // Skip blocks that loop to themselves, can't tail merge these.
	      if (PBB == IBB)
	        continue;

	      // Visit each predecessor only once.
	      if (!UniquePreds.insert(PBB).second)
	        continue;

	      // Skip blocks which may jump to a landing pad. Can't tail merge these.
	      if (PBB->hasEHPadSuccessor())
	        continue;

	      // After block placement, only consider predecessors that belong to the
	      // same loop as IBB. The reason is the same as above when skipping loop
	      // header.
	      if (AfterBlockPlacement && MLI)
	        if (ML != MLI->getLoopFor(PBB))
	          continue;

	      MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
	      SmallVector<MachineOperand, 4> Cond;
	      if (!TII->analyzeBranch(*PBB, TBB, FBB, Cond, true)) {
	        // Failing case: IBB is the target of a cbr, and we cannot reverse the
	        // branch.
	        SmallVector<MachineOperand, 4> NewCond(Cond);
	        if (!Cond.empty() && TBB == IBB) {
	          if (TII->reverseBranchCondition(NewCond))
	            continue;
	          // This is the QBB case described above
	          if (!FBB) {
	            auto Next = ++PBB->getIterator();
	            if (Next != MF.end())
	              FBB = &*Next;
	          }
	        }

	        // Failing case: the only way IBB can be reached from PBB is via
	        // exception handling. Happens for landing pads. Would be nice to have
	        // a bit in the edge so we didn't have to do all this.
	        if (IBB->isEHPad()) {
	          MachineFunction::iterator IP = ++PBB->getIterator();
	          MachineBasicBlock *PredNextBB = nullptr;
	          if (IP != MF.end())
	            PredNextBB = &*IP;
	          if (!TBB) {
	            if (IBB != PredNextBB) // fallthrough
	              continue;
	          } else if (FBB) {
	            if (TBB != IBB && FBB != IBB) // cbr then ubr
	              continue;
	          } else if (Cond.empty()) {
	            if (TBB != IBB) // ubr
	              continue;
	          } else {
	            if (TBB != IBB && IBB != PredNextBB) // cbr
	              continue;
	          }
	        }

	        // Remove the unconditional branch at the end, if any.
	        if (TBB && (Cond.empty() || FBB)) {
	          DebugLoc dl; // FIXME: this is nowhere
	          TII->removeBranch(*PBB);
	          if (!Cond.empty())
	            // reinsert conditional branch only, for now
	            TII->insertBranch(*PBB, (TBB == IBB) ? FBB : TBB, nullptr,
	                              NewCond, dl);
	        }

	        MergePotentials.push_back(MergePotentialsElt(HashEndOfMBB(*PBB), PBB));
	      }
	    }

	    // If this is a large problem, avoid visiting the same basic blocks multiple
	    // times.
	    if (MergePotentials.size() == TailMergeThreshold)
	      for (unsigned i = 0, e = MergePotentials.size(); i != e; ++i)
	        TriedMerging.insert(MergePotentials[i].getBlock());

	    if (MergePotentials.size() >= 2)
	      MadeChange |= TryTailMergeBlocks(IBB, PredBB, MinCommonTailLength);

	    // Reinsert an unconditional branch if needed. The 1 below can occur as a
	    // result of removing blocks in TryTailMergeBlocks.
	    PredBB = &*std::prev(I); // this may have been changed in TryTailMergeBlocks
	    if (MergePotentials.size() == 1 &&
	        MergePotentials.begin()->getBlock() != PredBB)
	      FixTail(MergePotentials.begin()->getBlock(), IBB, TII);
	  }

	  return MadeChange;
	}

	// Recompute the block frequency of the common tail TailMBB and, when it has
	// more than one successor, the probabilities of its outgoing edges. The new
	// frequency is the sum of the frequencies of all blocks in SameTails; each
	// edge probability is that edge's accumulated frequency divided by the total
	// over all edges.
	void BranchFolder::setCommonTailEdgeWeights(MachineBasicBlock &TailMBB) {
	  SmallVector<BlockFrequency, 2> EdgeFreqLs(TailMBB.succ_size());
	  BlockFrequency AccumulatedMBBFreq;

	  // Aggregate edge frequency of successor edge j:
	  // edgeFreq(j) = sum (freq(bb) * edgeProb(bb, j)),
	  // where bb is a basic block that is in SameTails.
	  for (const auto &Src : SameTails) {
	    const MachineBasicBlock *SrcMBB = Src.getBlock();
	    BlockFrequency BlockFreq = MBBFreqInfo.getBlockFreq(SrcMBB);
	    AccumulatedMBBFreq += BlockFreq;

	    // It is not necessary to recompute edge weights if TailBB has less than two
	    // successors.
	    if (TailMBB.succ_size() <= 1)
	      continue;

	    auto EdgeFreq = EdgeFreqLs.begin();

	    // EdgeFreq walks EdgeFreqLs in step with TailMBB's successor list.
	    for (auto SuccI = TailMBB.succ_begin(), SuccE = TailMBB.succ_end();
	         SuccI != SuccE; ++SuccI, ++EdgeFreq)
	      *EdgeFreq += BlockFreq * MBPI.getEdgeProbability(SrcMBB, *SuccI);
	  }

	  MBBFreqInfo.setBlockFreq(&TailMBB, AccumulatedMBBFreq);

	  if (TailMBB.succ_size() <= 1)
	    return;

	  auto SumEdgeFreq =
	      std::accumulate(EdgeFreqLs.begin(), EdgeFreqLs.end(), BlockFrequency(0))
	          .getFrequency();
	  auto EdgeFreq = EdgeFreqLs.begin();

	  // With a zero total the existing successor probabilities are left alone.
	  if (SumEdgeFreq > 0) {
	    for (auto SuccI = TailMBB.succ_begin(), SuccE = TailMBB.succ_end();
	         SuccI != SuccE; ++SuccI, ++EdgeFreq) {
	      auto Prob = BranchProbability::getBranchProbability(
	          EdgeFreq->getFrequency(), SumEdgeFreq);
	      TailMBB.setSuccProbability(SuccI, Prob);
	    }
	  }
	}

	//===----------------------------------------------------------------------===//
	// Branch Optimization
	//===----------------------------------------------------------------------===//

	// Run OptimizeBlock over every block of MF except the entry block, and
	// delete any block that has no predecessors afterwards.
	// Returns true if anything changed.
	bool BranchFolder::OptimizeBranches(MachineFunction &MF) {
	  bool MadeChange = false;

	  // Make sure blocks are numbered in order
	  MF.RenumberBlocks();
	  // Renumbering blocks alters funclet membership, recalculate it.
	  FuncletMembership = getFuncletMembership(MF);

	  for (MachineFunction::iterator I = std::next(MF.begin()), E = MF.end();
	       I != E; ) {
	    // Step the iterator before working on the block, since the block may
	    // be removed below.
	    MachineBasicBlock *MBB = &*I++;
	    MadeChange |= OptimizeBlock(MBB);

	    // If it is dead, remove it.
	    if (MBB->pred_empty()) {
	      RemoveDeadBlock(MBB);
	      MadeChange = true;
	      ++NumDeadBlocks;
	    }
	  }

	  return MadeChange;
	}

	// A block counts as empty when it holds no instruction other than debug
	// info; treating such blocks differently would let debug info affect codegen.
	static bool IsEmptyBlock(MachineBasicBlock *MBB) {
	  const bool HasNoRealInstr = MBB->getFirstNonDebugInstr() == MBB->end();
	  return HasNoRealInstr;
	}

	// True when the first non-debug instruction of MBB is a branch, so that a
	// block holding only debug info and branches is treated like one holding
	// only branches. The block must contain at least one non-debug instruction.
	static bool IsBranchOnlyBlock(MachineBasicBlock *MBB) {
	  MachineBasicBlock::iterator FirstReal = MBB->getFirstNonDebugInstr();
	  assert(FirstReal != MBB->end() && "empty block!");
	  return FirstReal->isBranch();
	}

	/// IsBetterFallthrough - Return true if it would be clearly better to
	/// fall-through to MBB1 than to fall through into MBB2. This has to return
	/// a strict ordering, returning true for both (MBB1,MBB2) and (MBB2,MBB1) will
	/// result in infinite loops.
	static bool IsBetterFallthrough(MachineBasicBlock *MBB1,
	                                MachineBasicBlock *MBB2) {
	  // Right now, we use a simple heuristic. If MBB2 ends with a call, and
	  // MBB1 doesn't, we prefer to fall through into MBB1. This allows us to
	  // optimize branches that branch to either a return block or an assert block
	  // into a fallthrough to the return.
	  MachineBasicBlock::iterator MBB1I = MBB1->getLastNonDebugInstr();
	  MachineBasicBlock::iterator MBB2I = MBB2->getLastNonDebugInstr();
	  // No preference if either block has no non-debug instruction.
	  if (MBB1I == MBB1->end() || MBB2I == MBB2->end())
	    return false;

	  // If there is a clear successor ordering we make sure that one block
	  // will fall through to the next
	  if (MBB1->isSuccessor(MBB2)) return true;
	  if (MBB2->isSuccessor(MBB1)) return false;

	  return MBB2I->isCall() && !MBB1I->isCall();
	}

	/// getBranchDebugLoc - Return the DebugLoc of the last non-debug instruction
	/// of the block when that instruction is a branch; otherwise return a
	/// default-constructed DebugLoc.
	static DebugLoc getBranchDebugLoc(MachineBasicBlock &MBB) {
	  MachineBasicBlock::iterator Last = MBB.getLastNonDebugInstr();
	  if (Last == MBB.end() || !Last->isBranch())
	    return DebugLoc();
	  return Last->getDebugLoc();
	}

	/// OptimizeBlock - Analyze and optimize control flow related to the specified
	/// block. This is never called on the entry block.
	/// Returns true if any change was made (the accumulated MadeChange flag).
	bool BranchFolder::OptimizeBlock(MachineBasicBlock *MBB) {
	bool MadeChange = false;
	MachineFunction &MF = *MBB->getParent();
	// Re-entry point: every transformation below that ends in
	// "goto ReoptimizeBlock" restarts the whole analysis from here, so
	// FallThrough and the branch analyses are recomputed after each change.
	ReoptimizeBlock:

	MachineFunction::iterator FallThrough = MBB->getIterator();
	++FallThrough;

	// Make sure MBB and FallThrough belong to the same funclet.
	bool SameFunclet = true;
	if (!FuncletMembership.empty() && FallThrough != MF.end()) {
	auto MBBFunclet = FuncletMembership.find(MBB);
	assert(MBBFunclet != FuncletMembership.end());
	auto FallThroughFunclet = FuncletMembership.find(&*FallThrough);
	assert(FallThroughFunclet != FuncletMembership.end());
	SameFunclet = MBBFunclet->second == FallThroughFunclet->second;
	}

	// If this block is empty, make everyone use its fall-through, not the block
	// explicitly. Landing pads should not do this since the landing-pad table
	// points to this block. Blocks with their addresses taken shouldn't be
	// optimized away.
	if (IsEmptyBlock(MBB) && !MBB->isEHPad() && !MBB->hasAddressTaken() &&
	SameFunclet) {
	// Dead block? Leave for cleanup later.
	if (MBB->pred_empty()) return MadeChange;

	if (FallThrough == MF.end()) {
	// TODO: Simplify preds to not branch here if possible!
	} else if (FallThrough->isEHPad()) {
	// Don't rewrite to a landing pad fallthough. That could lead to the case
	// where a BB jumps to more than one landing pad.
	// TODO: Is it ever worth rewriting predecessors which don't already
	// jump to a landing pad, and so can safely jump to the fallthrough?
	} else if (MBB->isSuccessor(&*FallThrough)) {
	// Rewrite all predecessors of the old block to go to the fallthrough
	// instead.
	while (!MBB->pred_empty()) {
	MachineBasicBlock Pred = (MBB->pred_end()-1);
	Pred->ReplaceUsesOfBlockWith(MBB, &*FallThrough);
	}
	// If MBB was the target of a jump table, update jump tables to go to the
	// fallthrough instead.
	if (MachineJumpTableInfo *MJTI = MF.getJumpTableInfo())
	MJTI->ReplaceMBBInJumpTables(MBB, &*FallThrough);
	MadeChange = true;
	}
	return MadeChange;
	}

	// Check to see if we can simplify the terminator of the block before this
	// one.
	// NOTE(review): std::prev is only valid because MBB is never the entry
	// block (see the function comment) - confirm callers keep that guarantee.
	MachineBasicBlock &PrevBB = *std::prev(MachineFunction::iterator(MBB));

	MachineBasicBlock PriorTBB = nullptr, PriorFBB = nullptr;
	SmallVector<MachineOperand, 4> PriorCond;
	bool PriorUnAnalyzable =
	TII->analyzeBranch(PrevBB, PriorTBB, PriorFBB, PriorCond, true);
	if (!PriorUnAnalyzable) {
	// If the CFG for the prior block has extra edges, remove them.
	MadeChange \|= PrevBB.CorrectExtraCFGEdges(PriorTBB, PriorFBB,
	!PriorCond.empty());

	// If the previous branch is conditional and both conditions go to the same
	// destination, remove the branch, replacing it with an unconditional one or
	// a fall-through.
	if (PriorTBB && PriorTBB == PriorFBB) {
	DebugLoc dl = getBranchDebugLoc(PrevBB);
	TII->removeBranch(PrevBB);
	PriorCond.clear();
	if (PriorTBB != MBB)
	TII->insertBranch(PrevBB, PriorTBB, nullptr, PriorCond, dl);
	MadeChange = true;
	++NumBranchOpts;
	goto ReoptimizeBlock;
	}

	// If the previous block unconditionally falls through to this block and
	// this block has no other predecessors, move the contents of this block
	// into the prior block. This doesn't usually happen when SimplifyCFG
	// has been used, but it can happen if tail merging splits a fall-through
	// predecessor of a block.
	// This has to check PrevBB->succ_size() because EH edges are ignored by
	// AnalyzeBranch.
	if (PriorCond.empty() && !PriorTBB && MBB->pred_size() == 1 &&
	PrevBB.succ_size() == 1 &&
	!MBB->hasAddressTaken() && !MBB->isEHPad()) {
	DEBUG(dbgs() << "\nMerging into block: " << PrevBB
	<< "From MBB: " << *MBB);
	// Remove redundant DBG_VALUEs first.
	if (PrevBB.begin() != PrevBB.end()) {
	MachineBasicBlock::iterator PrevBBIter = PrevBB.end();
	--PrevBBIter;
	MachineBasicBlock::iterator MBBIter = MBB->begin();
	// Check if DBG_VALUE at the end of PrevBB is identical to the
	// DBG_VALUE at the beginning of MBB.
	while (PrevBBIter != PrevBB.begin() && MBBIter != MBB->end()
	&& PrevBBIter->isDebugValue() && MBBIter->isDebugValue()) {
	if (!MBBIter->isIdenticalTo(*PrevBBIter))
	break;
	MachineInstr &DuplicateDbg = *MBBIter;
	++MBBIter; -- PrevBBIter;
	DuplicateDbg.eraseFromParent();
	}
	}
	PrevBB.splice(PrevBB.end(), MBB, MBB->begin(), MBB->end());
	PrevBB.removeSuccessor(PrevBB.succ_begin());
	assert(PrevBB.succ_empty());
	PrevBB.transferSuccessors(MBB);
	MadeChange = true;
	return MadeChange;
	}

	// If the previous branch only branches to this block (conditional or
	// not) remove the branch.
	if (PriorTBB == MBB && !PriorFBB) {
	TII->removeBranch(PrevBB);
	MadeChange = true;
	++NumBranchOpts;
	goto ReoptimizeBlock;
	}

	// If the prior block branches somewhere else on the condition and here if
	// the condition is false, remove the uncond second branch.
	if (PriorFBB == MBB) {
	DebugLoc dl = getBranchDebugLoc(PrevBB);
	TII->removeBranch(PrevBB);
	TII->insertBranch(PrevBB, PriorTBB, nullptr, PriorCond, dl);
	MadeChange = true;
	++NumBranchOpts;
	goto ReoptimizeBlock;
	}

	// If the prior block branches here on true and somewhere else on false, and
	// if the branch condition is reversible, reverse the branch to create a
	// fall-through.
	if (PriorTBB == MBB) {
	SmallVector<MachineOperand, 4> NewPriorCond(PriorCond);
	if (!TII->reverseBranchCondition(NewPriorCond)) {
	DebugLoc dl = getBranchDebugLoc(PrevBB);
	TII->removeBranch(PrevBB);
	TII->insertBranch(PrevBB, PriorFBB, nullptr, NewPriorCond, dl);
	MadeChange = true;
	++NumBranchOpts;
	goto ReoptimizeBlock;
	}
	}

	// If this block has no successors (e.g. it is a return block or ends with
	// a call to a no-return function like abort or __cxa_throw) and if the pred
	// falls through into this block, and if it would otherwise fall through
	// into the block after this, move this block to the end of the function.
	//
	// We consider it more likely that execution will stay in the function (e.g.
	// due to loops) than it is to exit it. This asserts in loops etc, moving
	// the assert condition out of the loop body.
	if (MBB->succ_empty() && !PriorCond.empty() && !PriorFBB &&
	MachineFunction::iterator(PriorTBB) == FallThrough &&
	!MBB->canFallThrough()) {
	bool DoTransform = true;

	// We have to be careful that the succs of PredBB aren't both no-successor
	// blocks. If neither have successors and if PredBB is the second from
	// last block in the function, we'd just keep swapping the two blocks for
	// last. Only do the swap if one is clearly better to fall through than
	// the other.
	if (FallThrough == --MF.end() &&
	!IsBetterFallthrough(PriorTBB, MBB))
	DoTransform = false;

	if (DoTransform) {
	// Reverse the branch so we will fall through on the previous true cond.
	SmallVector<MachineOperand, 4> NewPriorCond(PriorCond);
	if (!TII->reverseBranchCondition(NewPriorCond)) {
	DEBUG(dbgs() << "\nMoving MBB: " << *MBB
	<< "To make fallthrough to: " << *PriorTBB << "\n");

	DebugLoc dl = getBranchDebugLoc(PrevBB);
	TII->removeBranch(PrevBB);
	TII->insertBranch(PrevBB, MBB, nullptr, NewPriorCond, dl);

	// Move this block to the end of the function.
	MBB->moveAfter(&MF.back());
	MadeChange = true;
	++NumBranchOpts;
	return MadeChange;
	}
	}
	- }
	- }
	-
	- if (!IsEmptyBlock(MBB) && MBB->pred_size() == 1 &&
	- MF.getFunction()->optForSize()) {
	- // Changing "Jcc foo; foo: jmp bar;" into "Jcc bar;" might change the branch
	- // direction, thereby defeating careful block placement and regressing
	- // performance. Therefore, only consider this for optsize functions.
	- MachineInstr &TailCall = *MBB->getFirstNonDebugInstr();
	- if (TII->isUnconditionalTailCall(TailCall)) {
	- MachineBasicBlock Pred = MBB->pred_begin();
	- MachineBasicBlock PredTBB = nullptr, PredFBB = nullptr;
	- SmallVector<MachineOperand, 4> PredCond;
	- bool PredAnalyzable =
	- !TII->analyzeBranch(*Pred, PredTBB, PredFBB, PredCond, true);
	-
	- if (PredAnalyzable && !PredCond.empty() && PredTBB == MBB) {
	- // The predecessor has a conditional branch to this block which consists
	- // of only a tail call. Try to fold the tail call into the conditional
	- // branch.
	- if (TII->canMakeTailCallConditional(PredCond, TailCall)) {
	- // TODO: It would be nice if analyzeBranch() could provide a pointer
	- // to the branch insturction so replaceBranchWithTailCall() doesn't
	- // have to search for it.
	- TII->replaceBranchWithTailCall(*Pred, PredCond, TailCall);
	- ++NumTailCalls;
	- Pred->removeSuccessor(MBB);
	- MadeChange = true;
	- return MadeChange;
	- }
	- }
	- // If the predecessor is falling through to this block, we could reverse
	- // the branch condition and fold the tail call into that. However, after
	- // that we might have to re-arrange the CFG to fall through to the other
	- // block and there is a high risk of regressing code size rather than
	- // improving it.
	}
	}

	// Analyze the branch in the current block.
	MachineBasicBlock CurTBB = nullptr, CurFBB = nullptr;
	SmallVector<MachineOperand, 4> CurCond;
	bool CurUnAnalyzable =
	TII->analyzeBranch(*MBB, CurTBB, CurFBB, CurCond, true);
	if (!CurUnAnalyzable) {
	// If the CFG for the prior block has extra edges, remove them.
	MadeChange \|= MBB->CorrectExtraCFGEdges(CurTBB, CurFBB, !CurCond.empty());

	// If this is a two-way branch, and the FBB branches to this block, reverse
	// the condition so the single-basic-block loop is faster. Instead of:
	// Loop: xxx; jcc Out; jmp Loop
	// we want:
	// Loop: xxx; jncc Loop; jmp Out
	if (CurTBB && CurFBB && CurFBB == MBB && CurTBB != MBB) {
	SmallVector<MachineOperand, 4> NewCond(CurCond);
	if (!TII->reverseBranchCondition(NewCond)) {
	DebugLoc dl = getBranchDebugLoc(*MBB);
	TII->removeBranch(*MBB);
	TII->insertBranch(*MBB, CurFBB, CurTBB, NewCond, dl);
	MadeChange = true;
	++NumBranchOpts;
	goto ReoptimizeBlock;
	}
	}

	// If this branch is the only thing in its block, see if we can forward
	// other blocks across it.
	if (CurTBB && CurCond.empty() && !CurFBB &&
	IsBranchOnlyBlock(MBB) && CurTBB != MBB &&
	!MBB->hasAddressTaken() && !MBB->isEHPad()) {
	DebugLoc dl = getBranchDebugLoc(*MBB);
	// This block may contain just an unconditional branch. Because there can
	// be 'non-branch terminators' in the block, try removing the branch and
	// then seeing if the block is empty.
	TII->removeBranch(*MBB);
	// If the only things remaining in the block are debug info, remove these
	// as well, so this will behave the same as an empty block in non-debug
	// mode.
	if (IsEmptyBlock(MBB)) {
	// Make the block empty, losing the debug info (we could probably
	// improve this in some cases.)
	MBB->erase(MBB->begin(), MBB->end());
	}
	// If this block is just an unconditional branch to CurTBB, we can
	// usually completely eliminate the block. The only case we cannot
	// completely eliminate the block is when the block before this one
	// falls through into MBB and we can't understand the prior block's branch
	// condition.
	if (MBB->empty()) {
	bool PredHasNoFallThrough = !PrevBB.canFallThrough();
	if (PredHasNoFallThrough \|\| !PriorUnAnalyzable \|\|
	!PrevBB.isSuccessor(MBB)) {
	// If the prior block falls through into us, turn it into an
	// explicit branch to us to make updates simpler.
	if (!PredHasNoFallThrough && PrevBB.isSuccessor(MBB) &&
	PriorTBB != MBB && PriorFBB != MBB) {
	if (!PriorTBB) {
	assert(PriorCond.empty() && !PriorFBB &&
	"Bad branch analysis");
	PriorTBB = MBB;
	} else {
	assert(!PriorFBB && "Machine CFG out of date!");
	PriorFBB = MBB;
	}
	DebugLoc pdl = getBranchDebugLoc(PrevBB);
	TII->removeBranch(PrevBB);
	TII->insertBranch(PrevBB, PriorTBB, PriorFBB, PriorCond, pdl);
	}

	// Iterate through all the predecessors, revectoring each in-turn.
	// PI only advances past a self-edge: a revectored predecessor is expected
	// to drop out of MBB's predecessor list, so the same index then refers to
	// the next candidate.
	size_t PI = 0;
	bool DidChange = false;
	bool HasBranchToSelf = false;
	while(PI != MBB->pred_size()) {
	MachineBasicBlock PMBB = (MBB->pred_begin() + PI);
	if (PMBB == MBB) {
	// If this block has an uncond branch to itself, leave it.
	++PI;
	HasBranchToSelf = true;
	} else {
	DidChange = true;
	PMBB->ReplaceUsesOfBlockWith(MBB, CurTBB);
	// If this change resulted in PMBB ending in a conditional
	// branch where both conditions go to the same destination,
	// change this to an unconditional branch (and fix the CFG).
	MachineBasicBlock NewCurTBB = nullptr, NewCurFBB = nullptr;
	SmallVector<MachineOperand, 4> NewCurCond;
	bool NewCurUnAnalyzable = TII->analyzeBranch(
	*PMBB, NewCurTBB, NewCurFBB, NewCurCond, true);
	if (!NewCurUnAnalyzable && NewCurTBB && NewCurTBB == NewCurFBB) {
	DebugLoc pdl = getBranchDebugLoc(*PMBB);
	TII->removeBranch(*PMBB);
	NewCurCond.clear();
	TII->insertBranch(*PMBB, NewCurTBB, nullptr, NewCurCond, pdl);
	MadeChange = true;
	++NumBranchOpts;
	PMBB->CorrectExtraCFGEdges(NewCurTBB, nullptr, false);
	}
	}
	}

	// Change any jumptables to go to the new MBB.
	if (MachineJumpTableInfo *MJTI = MF.getJumpTableInfo())
	MJTI->ReplaceMBBInJumpTables(MBB, CurTBB);
	if (DidChange) {
	++NumBranchOpts;
	MadeChange = true;
	if (!HasBranchToSelf) return MadeChange;
	}
	}
	}

	// Add the branch back if the block is more than just an uncond branch.
	TII->insertBranch(*MBB, CurTBB, nullptr, CurCond, dl);
	}
	}

	// If the prior block doesn't fall through into this block, and if this
	// block doesn't fall through into some other block, see if we can find a
	// place to move this block where a fall-through will happen.
	if (!PrevBB.canFallThrough()) {

	// Now we know that there was no fall-through into this block, check to
	// see if it has a fall-through into its successor.
	bool CurFallsThru = MBB->canFallThrough();

	if (!MBB->isEHPad()) {
	// Check all the predecessors of this block. If one of them has no fall
	// throughs, move this block right after it.
	for (MachineBasicBlock *PredBB : MBB->predecessors()) {
	// Analyze the branch at the end of the pred.
	MachineBasicBlock PredTBB = nullptr, PredFBB = nullptr;
	SmallVector<MachineOperand, 4> PredCond;
	if (PredBB != MBB && !PredBB->canFallThrough() &&
	!TII->analyzeBranch(*PredBB, PredTBB, PredFBB, PredCond, true) &&
	(!CurFallsThru \|\| !CurTBB \|\| !CurFBB) &&
	(!CurFallsThru \|\| MBB->getNumber() >= PredBB->getNumber())) {
	// If the current block doesn't fall through, just move it.
	// If the current block can fall through and does not end with a
	// conditional branch, we need to append an unconditional jump to
	// the (current) next block. To avoid a possible compile-time
	// infinite loop, move blocks only backward in this case.
	// Also, if there are already 2 branches here, we cannot add a third;
	// this means we have the case
	// Bcc next
	// B elsewhere
	// next:
	if (CurFallsThru) {
	MachineBasicBlock NextBB = &std::next(MBB->getIterator());
	CurCond.clear();
	TII->insertBranch(*MBB, NextBB, nullptr, CurCond, DebugLoc());
	}
	MBB->moveAfter(PredBB);
	MadeChange = true;
	goto ReoptimizeBlock;
	}
	}
	}

	if (!CurFallsThru) {
	// Check all successors to see if we can move this block before it.
	for (MachineBasicBlock *SuccBB : MBB->successors()) {
	// Analyze the branch at the end of the block before the succ.
	MachineFunction::iterator SuccPrev = --SuccBB->getIterator();

	// If this block doesn't already fall-through to that successor, and if
	// the succ doesn't already have a block that can fall through into it,
	// and if the successor isn't an EH destination, we can arrange for the
	// fallthrough to happen.
	if (SuccBB != MBB && &*SuccPrev != MBB &&
	!SuccPrev->canFallThrough() && !CurUnAnalyzable &&
	!SuccBB->isEHPad()) {
	MBB->moveBefore(SuccBB);
	MadeChange = true;
	goto ReoptimizeBlock;
	}
	}

	// Okay, there is no really great place to put this block. If, however,
	// the block before this one would be a fall-through if this block were
	// removed, move this block to the end of the function. There is no real
	// advantage in "falling through" to an EH block, so we don't want to
	// perform this transformation for that case.
	//
	// Also, Windows EH introduced the possibility of an arbitrary number of
	// successors to a given block. The analyzeBranch call does not consider
	// exception handling and so we can get in a state where a block
	// containing a call is followed by multiple EH blocks that would be
	// rotated infinitely at the end of the function if the transformation
	// below were performed for EH "FallThrough" blocks. Therefore, even if
	// that appears not to be happening anymore, we should assume that it is
	// possible and not remove the "!FallThrough()->isEHPad" condition below.
	MachineBasicBlock PrevTBB = nullptr, PrevFBB = nullptr;
	SmallVector<MachineOperand, 4> PrevCond;
	if (FallThrough != MF.end() &&
	!FallThrough->isEHPad() &&
	!TII->analyzeBranch(PrevBB, PrevTBB, PrevFBB, PrevCond, true) &&
	PrevBB.isSuccessor(&*FallThrough)) {
	MBB->moveAfter(&MF.back());
	MadeChange = true;
	return MadeChange;
	}
	}
	}

	return MadeChange;
	}

	//===----------------------------------------------------------------------===//
	// Hoist Common Code
	//===----------------------------------------------------------------------===//

	/// HoistCommonCode - Hoist common instruction sequences at the start of basic
	/// blocks to their common predecessor.
	///
	/// Runs HoistCommonCodeInSuccs on every block of \p MF.
	/// \returns true if any of those calls reported a change.
	bool BranchFolder::HoistCommonCode(MachineFunction &MF) {
	  bool MadeChange = false;
	  for (MachineFunction::iterator I = MF.begin(), E = MF.end(); I != E; ) {
	    // Step the iterator past the block before handing the block to the
	    // hoisting routine.
	    MachineBasicBlock *MBB = &*I++;
	    MadeChange |= HoistCommonCodeInSuccs(MBB);
	  }

	  return MadeChange;
	}

	/// findFalseBlock - BB has a fallthrough. Find its 'false' successor given
	/// its 'true' successor.
	///
	/// \returns the first successor of \p BB that is not \p TrueBB, or nullptr
	/// when every successor is \p TrueBB (or there are no successors).
	static MachineBasicBlock *findFalseBlock(MachineBasicBlock *BB,
	                                         MachineBasicBlock *TrueBB) {
	  for (MachineBasicBlock *SuccBB : BB->successors())
	    if (SuccBB != TrueBB)
	      return SuccBB;
	  return nullptr;
	}

	template <class Container>
	static void addRegAndItsAliases(unsigned Reg, const TargetRegisterInfo *TRI,
	Container &Set) {
	if (TargetRegisterInfo::isPhysicalRegister(Reg)) {
	for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI)
	Set.insert(*AI);
	} else {
	Set.insert(Reg);
	}
	}

	/// findHoistingInsertPosAndDeps - Find the location to move common instructions
	/// in successors to. The location is usually just before the terminator,
	/// however if the terminator is a conditional branch and its previous
	/// instruction is the flag setting instruction, the previous instruction is
	/// the preferred location. This function also gathers uses and defs of the
	/// instructions from the insertion point to the end of the block. The data is
	/// used by HoistCommonCodeInSuccs to ensure safety.
	///
	/// \param MBB  block whose first terminator anchors the search.
	/// \param TII  used to query terminator / predication properties.
	/// \param TRI  used to expand registers into their aliases / sub-registers.
	/// \param Uses [out] registers read by the instructions from the returned
	///             position to the end of the block.
	/// \param Defs [out] registers written by those instructions.
	/// \returns the insertion point, or MBB->end() when hoisting must not be
	///          attempted for this block.
	static
	MachineBasicBlock::iterator findHoistingInsertPosAndDeps(MachineBasicBlock *MBB,
	                                                         const TargetInstrInfo *TII,
	                                                         const TargetRegisterInfo *TRI,
	                                                         SmallSet<unsigned,4> &Uses,
	                                                         SmallSet<unsigned,4> &Defs) {
	  MachineBasicBlock::iterator Loc = MBB->getFirstTerminator();
	  if (!TII->isUnpredicatedTerminator(*Loc))
	    return MBB->end();

	  // Record what the terminator itself reads and writes.
	  for (const MachineOperand &MO : Loc->operands()) {
	    if (!MO.isReg())
	      continue;
	    unsigned Reg = MO.getReg();
	    if (!Reg)
	      continue;
	    if (MO.isUse()) {
	      addRegAndItsAliases(Reg, TRI, Uses);
	    } else {
	      if (!MO.isDead())
	        // Don't try to hoist code in the rare case the terminator defines a
	        // register that is later used.
	        return MBB->end();

	      // If the terminator defines a register, make sure we don't hoist
	      // the instruction whose def might be clobbered by the terminator.
	      addRegAndItsAliases(Reg, TRI, Defs);
	    }
	  }

	  // A terminator that reads no register can be used as the insertion point
	  // directly.
	  if (Uses.empty())
	    return Loc;
	  // The terminator reads registers but nothing precedes it in the block.
	  if (Loc == MBB->begin())
	    return MBB->end();

	  // The terminator is probably a conditional branch, try not to separate the
	  // branch from condition setting instruction.
	  MachineBasicBlock::iterator PI =
	      skipDebugInstructionsBackward(std::prev(Loc), MBB->begin());

	  // Does PI define a register the terminator reads?
	  bool IsDef = false;
	  for (const MachineOperand &MO : PI->operands()) {
	    // If PI has a regmask operand, it is probably a call. Separate away.
	    if (MO.isRegMask())
	      return Loc;
	    if (!MO.isReg() || MO.isUse())
	      continue;
	    unsigned Reg = MO.getReg();
	    if (!Reg)
	      continue;
	    if (Uses.count(Reg)) {
	      IsDef = true;
	      break;
	    }
	  }
	  if (!IsDef)
	    // The condition setting instruction is not just before the conditional
	    // branch.
	    return Loc;

	  // Be conservative, don't insert instruction above something that may have
	  // side-effects. And since it's potentially bad to separate flag setting
	  // instruction from the conditional branch, just abort the optimization
	  // completely.
	  // Also avoid moving code above predicated instruction since it's hard to
	  // reason about register liveness with predicated instruction.
	  bool DontMoveAcrossStore = true;
	  if (!PI->isSafeToMove(nullptr, DontMoveAcrossStore) || TII->isPredicated(*PI))
	    return MBB->end();


	  // Find out what registers are live. Note this routine is ignoring other live
	  // registers which are only used by instructions in successor blocks.
	  for (const MachineOperand &MO : PI->operands()) {
	    if (!MO.isReg())
	      continue;
	    unsigned Reg = MO.getReg();
	    if (!Reg)
	      continue;
	    if (MO.isUse()) {
	      addRegAndItsAliases(Reg, TRI, Uses);
	    } else {
	      // PI defines Reg, so above PI the register is no longer a pending use.
	      if (Uses.erase(Reg)) {
	        if (TargetRegisterInfo::isPhysicalRegister(Reg)) {
	          for (MCSubRegIterator SubRegs(Reg, TRI); SubRegs.isValid(); ++SubRegs)
	            Uses.erase(*SubRegs); // Use sub-registers to be conservative
	        }
	      }
	      addRegAndItsAliases(Reg, TRI, Defs);
	    }
	  }

	  return PI;
	}

	/// HoistCommonCodeInSuccs - If the successors of MBB have a common instruction
	/// sequence at the start of their blocks, move the instructions before MBB's
	/// terminator if it's legal.
	///
	/// Only handles a block ending in an analyzable conditional branch whose two
	/// successors each have at most one predecessor.
	/// \returns true if at least one instruction was hoisted.
	bool BranchFolder::HoistCommonCodeInSuccs(MachineBasicBlock *MBB) {
	  MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
	  SmallVector<MachineOperand, 4> Cond;
	  if (TII->analyzeBranch(*MBB, TBB, FBB, Cond, true) || !TBB || Cond.empty())
	    return false;

	  if (!FBB) FBB = findFalseBlock(MBB, TBB);
	  if (!FBB)
	    // Malformed bcc? True and false blocks are the same?
	    return false;

	  // Restrict the optimization to cases where MBB is the only predecessor,
	  // it is an obvious win.
	  if (TBB->pred_size() > 1 || FBB->pred_size() > 1)
	    return false;

	  // Find a suitable position to hoist the common instructions to. Also figure
	  // out which registers are used or defined by instructions from the insertion
	  // point to the end of the block.
	  SmallSet<unsigned, 4> Uses, Defs;
	  MachineBasicBlock::iterator Loc =
	      findHoistingInsertPosAndDeps(MBB, TII, TRI, Uses, Defs);
	  if (Loc == MBB->end())
	    return false;

	  bool HasDups = false;
	  // Physical registers defined (and not dead) by the hoisted instructions, in
	  // order; LocalDefsSet additionally tracks their aliases and is pruned when a
	  // later hoisted instruction kills them.
	  SmallVector<unsigned, 4> LocalDefs;
	  SmallSet<unsigned, 4> LocalDefsSet;
	  MachineBasicBlock::iterator TIB = TBB->begin();
	  MachineBasicBlock::iterator FIB = FBB->begin();
	  MachineBasicBlock::iterator TIE = TBB->end();
	  MachineBasicBlock::iterator FIE = FBB->end();
	  // Walk both successors in lock-step while their instructions match and are
	  // safe to hoist.
	  while (TIB != TIE && FIB != FIE) {
	    // Skip dbg_value instructions. These do not count.
	    TIB = skipDebugInstructionsForward(TIB, TIE);
	    FIB = skipDebugInstructionsForward(FIB, FIE);
	    if (TIB == TIE || FIB == FIE)
	      break;

	    if (!TIB->isIdenticalTo(*FIB, MachineInstr::CheckKillDead))
	      break;

	    if (TII->isPredicated(*TIB))
	      // Hard to reason about register liveness with predicated instruction.
	      break;

	    bool IsSafe = true;
	    for (MachineOperand &MO : TIB->operands()) {
	      // Don't attempt to hoist instructions with register masks.
	      if (MO.isRegMask()) {
	        IsSafe = false;
	        break;
	      }
	      if (!MO.isReg())
	        continue;
	      unsigned Reg = MO.getReg();
	      if (!Reg)
	        continue;
	      if (MO.isDef()) {
	        if (Uses.count(Reg)) {
	          // Avoid clobbering a register that's used by the instruction at
	          // the point of insertion.
	          IsSafe = false;
	          break;
	        }

	        if (Defs.count(Reg) && !MO.isDead()) {
	          // Don't hoist the instruction if the def would be clobber by the
	          // instruction at the point insertion. FIXME: This is overly
	          // conservative. It should be possible to hoist the instructions
	          // in BB2 in the following example:
	          // BB1:
	          // r1, eflag = op1 r2, r3
	          // brcc eflag
	          //
	          // BB2:
	          // r1 = op2, ...
	          // = op3, r1<kill>
	          IsSafe = false;
	          break;
	        }
	      } else if (!LocalDefsSet.count(Reg)) {
	        if (Defs.count(Reg)) {
	          // Use is defined by the instruction at the point of insertion.
	          IsSafe = false;
	          break;
	        }

	        if (MO.isKill() && Uses.count(Reg))
	          // Kills a register that's read by the instruction at the point of
	          // insertion. Remove the kill marker.
	          MO.setIsKill(false);
	      }
	    }
	    if (!IsSafe)
	      break;

	    bool DontMoveAcrossStore = true;
	    if (!TIB->isSafeToMove(nullptr, DontMoveAcrossStore))
	      break;

	    // Remove kills from LocalDefsSet, these registers had short live ranges.
	    for (const MachineOperand &MO : TIB->operands()) {
	      if (!MO.isReg() || !MO.isUse() || !MO.isKill())
	        continue;
	      unsigned Reg = MO.getReg();
	      if (!Reg || !LocalDefsSet.count(Reg))
	        continue;
	      if (TargetRegisterInfo::isPhysicalRegister(Reg)) {
	        for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI)
	          LocalDefsSet.erase(*AI);
	      } else {
	        LocalDefsSet.erase(Reg);
	      }
	    }

	    // Track local defs so we can update liveins.
	    for (const MachineOperand &MO : TIB->operands()) {
	      if (!MO.isReg() || !MO.isDef() || MO.isDead())
	        continue;
	      unsigned Reg = MO.getReg();
	      if (!Reg || TargetRegisterInfo::isVirtualRegister(Reg))
	        continue;
	      LocalDefs.push_back(Reg);
	      addRegAndItsAliases(Reg, TRI, LocalDefsSet);
	    }

	    HasDups = true;
	    ++TIB;
	    ++FIB;
	  }

	  if (!HasDups)
	    return false;

	  // Move the matched prefix out of TBB into MBB and drop the duplicate prefix
	  // from FBB.
	  MBB->splice(Loc, TBB, TBB->begin(), TIB);
	  FBB->erase(FBB->begin(), FIB);

	  // Update livein's.
	  bool AddedLiveIns = false;
	  for (unsigned Def : LocalDefs) {
	    if (LocalDefsSet.count(Def)) {
	      TBB->addLiveIn(Def);
	      FBB->addLiveIn(Def);
	      AddedLiveIns = true;
	    }
	  }

	  if (AddedLiveIns) {
	    TBB->sortUniqueLiveIns();
	    FBB->sortUniqueLiveIns();
	  }

	  ++NumHoist;
	  return true;
	}
	Index: projects/clang400-import/contrib/llvm/lib/CodeGen/MachineCopyPropagation.cpp
	===================================================================
	--- projects/clang400-import/contrib/llvm/lib/CodeGen/MachineCopyPropagation.cpp (revision 313642)
	+++ projects/clang400-import/contrib/llvm/lib/CodeGen/MachineCopyPropagation.cpp (revision 313643)
	@@ -1,371 +1,385 @@
	//===- MachineCopyPropagation.cpp - Machine Copy Propagation Pass ---------===//
	//
	// The LLVM Compiler Infrastructure
	//
	// This file is distributed under the University of Illinois Open Source
	// License. See LICENSE.TXT for details.
	//
	//===----------------------------------------------------------------------===//
	//
	// This is an extremely simple MachineInstr-level copy propagation pass.
	//
	//===----------------------------------------------------------------------===//

	#include "llvm/CodeGen/Passes.h"
	#include "llvm/ADT/DenseMap.h"
	#include "llvm/ADT/SetVector.h"
	#include "llvm/ADT/SmallVector.h"
	#include "llvm/ADT/Statistic.h"
	#include "llvm/CodeGen/MachineFunction.h"
	#include "llvm/CodeGen/MachineFunctionPass.h"
	#include "llvm/CodeGen/MachineRegisterInfo.h"
	#include "llvm/Pass.h"
	#include "llvm/Support/Debug.h"
	#include "llvm/Support/raw_ostream.h"
	#include "llvm/Target/TargetInstrInfo.h"
	#include "llvm/Target/TargetRegisterInfo.h"
	#include "llvm/Target/TargetSubtargetInfo.h"
	using namespace llvm;

	#define DEBUG_TYPE "codegen-cp"

	// Counter incremented in eraseIfRedundant each time a copy is erased.
	STATISTIC(NumDeletes, "Number of dead copies deleted");

	namespace {
	// A short list of register numbers.
	typedef SmallVector<unsigned, 4> RegList;
	// Register -> list of registers (used below for the Src -> Def map).
	typedef DenseMap<unsigned, RegList> SourceMap;
	// Register -> instruction (used below for the Def -> copy maps).
	typedef DenseMap<unsigned, MachineInstr*> Reg2MIMap;

	// MachineInstr-level copy propagation pass (see the file header).
	class MachineCopyPropagation : public MachineFunctionPass {
	const TargetRegisterInfo *TRI;
	const TargetInstrInfo *TII;
	const MachineRegisterInfo *MRI;

	public:
	static char ID; // Pass identification, replacement for typeid
	MachineCopyPropagation() : MachineFunctionPass(ID) {
	initializeMachineCopyPropagationPass(*PassRegistry::getPassRegistry());
	}

	// Declares that the pass preserves the CFG, then defers to the base class.
	void getAnalysisUsage(AnalysisUsage &AU) const override {
	AU.setPreservesCFG();
	MachineFunctionPass::getAnalysisUsage(AU);
	}

	bool runOnMachineFunction(MachineFunction &MF) override;

	// The pass requires the NoVRegs machine-function property.
	MachineFunctionProperties getRequiredProperties() const override {
	return MachineFunctionProperties().set(
	MachineFunctionProperties::Property::NoVRegs);
	}

	private:
	void ClobberRegister(unsigned Reg);
	+ void ReadRegister(unsigned Reg);
	void CopyPropagateBlock(MachineBasicBlock &MBB);
	bool eraseIfRedundant(MachineInstr &Copy, unsigned Src, unsigned Def);

	/// Candidates for deletion.
	SmallSetVector<MachineInstr*, 8> MaybeDeadCopies;
	/// Def -> available copies map.
	Reg2MIMap AvailCopyMap;
	/// Def -> copies map.
	Reg2MIMap CopyMap;
	/// Src -> Def map
	SourceMap SrcMap;
	/// Set to true by eraseIfRedundant when it erases a copy.
	bool Changed;
	};
	}
	// Definition of the static pass identifier declared in the class.
	char MachineCopyPropagation::ID = 0;
	// Reference to the same identifier, exposed in the llvm namespace.
	char &llvm::MachineCopyPropagationID = MachineCopyPropagation::ID;

	// Pass initialization macro: argument name "machine-cp", the description
	// string below, and two flags that are both false.
	INITIALIZE_PASS(MachineCopyPropagation, "machine-cp",
	"Machine Copy Propagation Pass", false, false)

	/// Erase from \p Map every key that equals, or is a sub-register of, one of
	/// the registers listed in \p Regs.
	static void removeRegsFromMap(Reg2MIMap &Map, const RegList &Regs,
	                              const TargetRegisterInfo &TRI) {
	  for (RegList::const_iterator It = Regs.begin(), End = Regs.end(); It != End;
	       ++It) {
	    // The copy source is gone, so none of its (sub-)registers can be
	    // propagated any more.
	    MCSubRegIterator SubReg(*It, &TRI, true);
	    while (SubReg.isValid()) {
	      Map.erase(*SubReg);
	      ++SubReg;
	    }
	  }
	}

	/// Drop every entry of \p Map whose key register is clobbered according to
	/// \p RegMask.
	/// Walking the map is preferred over walking the clobbered registers and
	/// calling ClobberRegister() on each, because the map usually holds far fewer
	/// entries than the regmask clobbers.
	static void removeClobberedRegsFromMap(Reg2MIMap &Map,
	                                       const MachineOperand &RegMask) {
	  Reg2MIMap::iterator Cursor = Map.begin();
	  Reg2MIMap::iterator End = Map.end();
	  while (Cursor != End) {
	    // Remember the current entry and step past it before a possible erase.
	    Reg2MIMap::iterator Entry = Cursor;
	    Cursor = std::next(Entry);
	    if (RegMask.clobbersPhysReg(Entry->first))
	      Map.erase(Entry);
	  }
	}

	void MachineCopyPropagation::ClobberRegister(unsigned Reg) {
	for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI) {
	CopyMap.erase(*AI);
	AvailCopyMap.erase(*AI);

	SourceMap::iterator SI = SrcMap.find(*AI);
	if (SI != SrcMap.end()) {
	removeRegsFromMap(AvailCopyMap, SI->second, *TRI);
	SrcMap.erase(SI);
	}
	}
	}

	+void MachineCopyPropagation::ReadRegister(unsigned Reg) {
	+ // If 'Reg' (or a register its alias iterator yields) is defined by a copy
	+ // recorded in CopyMap, that copy is dropped from MaybeDeadCopies.
	+ for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI) {
	+ Reg2MIMap::iterator CI = CopyMap.find(*AI);
	+ if (CI != CopyMap.end()) {
	+ DEBUG(dbgs() << "MCP: Copy is used - not dead: "; CI->second->dump());
	+ MaybeDeadCopies.remove(CI->second);
	+ }
	+ }
	+}
	+
	/// Decide whether \p PreviousCopy already copied register \p Src into
	/// register \p Def. Sub-register usage can hide this, and it may simply be
	/// false even when Src and Def are sub-registers of the registers named in
	/// PreviousCopy. e.g.
	/// isNopCopy("ecx = COPY eax", AX, CX) == true
	/// isNopCopy("ecx = COPY eax", AH, CL) == false
	static bool isNopCopy(const MachineInstr &PreviousCopy, unsigned Src,
	                      unsigned Def, const TargetRegisterInfo *TRI) {
	  const unsigned PrevSrcReg = PreviousCopy.getOperand(1).getReg();
	  const unsigned PrevDefReg = PreviousCopy.getOperand(0).getReg();

	  // Exact match on the source register.
	  if (Src == PrevSrcReg) {
	    assert(Def == PrevDefReg);
	    return true;
	  }

	  // Otherwise Src must be a sub-register of the previous source, taken at
	  // the same sub-register index as Def within the previous destination.
	  if (TRI->isSubRegister(PrevSrcReg, Src)) {
	    const unsigned SrcSubIdx = TRI->getSubRegIndex(PrevSrcReg, Src);
	    const unsigned DefSubIdx = TRI->getSubRegIndex(PrevDefReg, Def);
	    return SrcSubIdx == DefSubIdx;
	  }
	  return false;
	}

	/// Remove instruction \p Copy if there exists a previous copy that copies the
	/// register \p Src to the register \p Def; This may happen indirectly by
	/// copying the super registers.
	///
	/// \param Copy the COPY instruction that may be redundant.
	/// \param Src  source register of the copy to look for.
	/// \param Def  destination register of the copy to look for.
	/// \returns true if \p Copy was erased; in that case Changed is set and
	///          NumDeletes is incremented.
	bool MachineCopyPropagation::eraseIfRedundant(MachineInstr &Copy, unsigned Src,
	                                              unsigned Def) {
	  // Avoid eliminating a copy from/to a reserved registers as we cannot predict
	  // the value (Example: The sparc zero register is writable but stays zero).
	  if (MRI->isReserved(Src) || MRI->isReserved(Def))
	    return false;

	  // Search for an existing copy.
	  Reg2MIMap::iterator CI = AvailCopyMap.find(Def);
	  if (CI == AvailCopyMap.end())
	    return false;

	  // Check that the existing copy uses the correct sub registers.
	  MachineInstr &PrevCopy = *CI->second;
	  if (!isNopCopy(PrevCopy, Src, Def, TRI))
	    return false;

	  DEBUG(dbgs() << "MCP: copy is a NOP, removing: "; Copy.dump());

	  // Copy was redundantly redefining either Src or Def. Remove earlier kill
	  // flags between Copy and PrevCopy because the value will be reused now.
	  assert(Copy.isCopy());
	  unsigned CopyDef = Copy.getOperand(0).getReg();
	  assert(CopyDef == Src || CopyDef == Def);
	  for (MachineInstr &MI :
	       make_range(PrevCopy.getIterator(), Copy.getIterator()))
	    MI.clearRegisterKills(CopyDef, TRI);

	  Copy.eraseFromParent();
	  Changed = true;
	  ++NumDeletes;
	  return true;
	}

	/// Propagate copies and delete redundant or dead COPY instructions within a
	/// single basic block.
	///
	/// The block is walked once from top to bottom. Each COPY is first offered to
	/// eraseIfRedundant(); a COPY that survives is recorded in CopyMap,
	/// AvailCopyMap and SrcMap and, unless its destination is reserved, becomes a
	/// deletion candidate in MaybeDeadCopies. For any other instruction, register
	/// reads are reported and register definitions (explicit, undef uses, or
	/// register-mask clobbers) invalidate the tracked copies. Remaining
	/// candidates are erased only when \p MBB has no successors. All per-block
	/// state is cleared before returning.
	void MachineCopyPropagation::CopyPropagateBlock(MachineBasicBlock &MBB) {
	  DEBUG(dbgs() << "MCP: CopyPropagateBlock " << MBB.getName() << "\n");

	  for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E; ) {
	    // Step the iterator first: MI may be erased while it is processed.
	    MachineInstr *MI = &*I;
	    ++I;

	    if (MI->isCopy()) {
	      unsigned Def = MI->getOperand(0).getReg();
	      unsigned Src = MI->getOperand(1).getReg();

	      assert(!TargetRegisterInfo::isVirtualRegister(Def) &&
	             !TargetRegisterInfo::isVirtualRegister(Src) &&
	             "MachineCopyPropagation should be run after register allocation!");

	      // The two copies cancel out and the source of the first copy
	      // hasn't been overridden, eliminate the second one. e.g.
	      // %ECX<def> = COPY %EAX
	      // ... nothing clobbered EAX.
	      // %EAX<def> = COPY %ECX
	      // =>
	      // %ECX<def> = COPY %EAX
	      //
	      // or
	      //
	      // %ECX<def> = COPY %EAX
	      // ... nothing clobbered EAX.
	      // %ECX<def> = COPY %EAX
	      // =>
	      // %ECX<def> = COPY %EAX
	      if (eraseIfRedundant(*MI, Def, Src) || eraseIfRedundant(*MI, Src, Def))
	        continue;

	      // If Src is defined by a previous copy, the previous copy cannot be
	      // eliminated.
	-     for (MCRegAliasIterator AI(Src, TRI, true); AI.isValid(); ++AI) {
	-       Reg2MIMap::iterator CI = CopyMap.find(*AI);
	-       if (CI != CopyMap.end()) {
	-         DEBUG(dbgs() << "MCP: Copy is no longer dead: "; CI->second->dump());
	-         MaybeDeadCopies.remove(CI->second);
	-       }
	+     ReadRegister(Src);
	+     for (const MachineOperand &MO : MI->implicit_operands()) {
	+       if (!MO.isReg() || !MO.readsReg())
	+         continue;
	+       unsigned Reg = MO.getReg();
	+       if (!Reg)
	+         continue;
	+       ReadRegister(Reg);
	      }

	      DEBUG(dbgs() << "MCP: Copy is a deletion candidate: "; MI->dump());

	      // Copy is now a candidate for deletion.
	      if (!MRI->isReserved(Def))
	        MaybeDeadCopies.insert(MI);

	      // If 'Def' is previously source of another copy, then this earlier copy's
	      // source is no longer available. e.g.
	      // %xmm9<def> = copy %xmm2
	      // ...
	      // %xmm2<def> = copy %xmm0
	      // ...
	      // %xmm2<def> = copy %xmm9
	      ClobberRegister(Def);
	+     for (const MachineOperand &MO : MI->implicit_operands()) {
	+       if (!MO.isReg() || !MO.isDef())
	+         continue;
	+       unsigned Reg = MO.getReg();
	+       if (!Reg)
	+         continue;
	+       ClobberRegister(Reg);
	+     }

	      // Remember Def is defined by the copy.
	      for (MCSubRegIterator SR(Def, TRI, /*IncludeSelf=*/true); SR.isValid();
	           ++SR) {
	        CopyMap[*SR] = MI;
	        AvailCopyMap[*SR] = MI;
	      }

	      // Remember source that's copied to Def. Once it's clobbered, then
	      // it's no longer available for copy propagation.
	      RegList &DestList = SrcMap[Src];
	      if (!is_contained(DestList, Def))
	        DestList.push_back(Def);

	      continue;
	    }

	    // Not a copy.
	    SmallVector<unsigned, 2> Defs;
	    const MachineOperand *RegMask = nullptr;
	    for (const MachineOperand &MO : MI->operands()) {
	      if (MO.isRegMask())
	        RegMask = &MO;
	      if (!MO.isReg())
	        continue;
	      unsigned Reg = MO.getReg();
	      if (!Reg)
	        continue;

	      assert(!TargetRegisterInfo::isVirtualRegister(Reg) &&
	             "MachineCopyPropagation should be run after register allocation!");

	      if (MO.isDef()) {
	        Defs.push_back(Reg);
	-       continue;
	-     }
	-
	-     // If 'Reg' is defined by a copy, the copy is no longer a candidate
	-     // for elimination.
	-     for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI) {
	-       Reg2MIMap::iterator CI = CopyMap.find(*AI);
	-       if (CI != CopyMap.end()) {
	-         DEBUG(dbgs() << "MCP: Copy is used - not dead: "; CI->second->dump());
	-         MaybeDeadCopies.remove(CI->second);
	-       }
	+     } else {
	+       ReadRegister(Reg);
	      }
	      // Treat undef use like defs for copy propagation but not for
	      // dead copy. We would need to do a liveness check to be sure the copy
	      // is dead for undef uses.
	      // The backends are allowed to do whatever they want with undef value
	      // and we cannot be sure this register will not be rewritten to break
	      // some false dependencies for the hardware for instance.
	      if (MO.isUndef())
	        Defs.push_back(Reg);
	    }

	    // The instruction has a register mask operand which means that it clobbers
	    // a large set of registers. Treat clobbered registers the same way as
	    // defined registers.
	    if (RegMask) {
	      // Erase any MaybeDeadCopies whose destination register is clobbered.
	      for (SmallSetVector<MachineInstr *, 8>::iterator DI =
	               MaybeDeadCopies.begin();
	           DI != MaybeDeadCopies.end();) {
	        MachineInstr *MaybeDead = *DI;
	        unsigned Reg = MaybeDead->getOperand(0).getReg();
	        assert(!MRI->isReserved(Reg));

	        if (!RegMask->clobbersPhysReg(Reg)) {
	          ++DI;
	          continue;
	        }

	        DEBUG(dbgs() << "MCP: Removing copy due to regmask clobbering: ";
	              MaybeDead->dump());

	        // erase() will return the next valid iterator pointing to the next
	        // element after the erased one.
	        DI = MaybeDeadCopies.erase(DI);
	        MaybeDead->eraseFromParent();
	        Changed = true;
	        ++NumDeletes;
	      }

	      removeClobberedRegsFromMap(AvailCopyMap, *RegMask);
	      removeClobberedRegsFromMap(CopyMap, *RegMask);
	      // Next is captured before a possible erase so the walk can continue.
	      for (SourceMap::iterator I = SrcMap.begin(), E = SrcMap.end(), Next;
	           I != E; I = Next) {
	        Next = std::next(I);
	        if (RegMask->clobbersPhysReg(I->first)) {
	          removeRegsFromMap(AvailCopyMap, I->second, *TRI);
	          SrcMap.erase(I);
	        }
	      }
	    }

	    // Any previous copy definition or reading the Defs is no longer available.
	    for (unsigned Reg : Defs)
	      ClobberRegister(Reg);
	  }

	  // If MBB doesn't have successors, delete the copies whose defs are not used.
	  // If MBB does have successors, then conservatively assume the defs are
	  // live-out since we don't want to trust live-in lists.
	  if (MBB.succ_empty()) {
	    for (MachineInstr *MaybeDead : MaybeDeadCopies) {
	      assert(!MRI->isReserved(MaybeDead->getOperand(0).getReg()));
	      MaybeDead->eraseFromParent();
	      Changed = true;
	      ++NumDeletes;
	    }
	  }

	  // Reset the per-block tracking state.
	  MaybeDeadCopies.clear();
	  AvailCopyMap.clear();
	  CopyMap.clear();
	  SrcMap.clear();
	}

	/// Pass entry point: cache the target/register info for \p MF and run
	/// CopyPropagateBlock() on each of its basic blocks.
	///
	/// \returns true if any block was modified; false if nothing changed or the
	/// function was skipped.
	bool MachineCopyPropagation::runOnMachineFunction(MachineFunction &MF) {
	  if (skipFunction(*MF.getFunction()))
	    return false;

	  // Start from a clean "nothing modified yet" state for this function.
	  Changed = false;

	  const auto &Subtarget = MF.getSubtarget();
	  TRI = Subtarget.getRegisterInfo();
	  TII = Subtarget.getInstrInfo();
	  MRI = &MF.getRegInfo();

	  for (MachineFunction::iterator Block = MF.begin(), BlockEnd = MF.end();
	       Block != BlockEnd; ++Block)
	    CopyPropagateBlock(*Block);

	  return Changed;
	}

	Index: projects/clang400-import/contrib/llvm/lib/CodeGen/RegisterCoalescer.cpp
	===================================================================
	--- projects/clang400-import/contrib/llvm/lib/CodeGen/RegisterCoalescer.cpp (revision 313642)
	+++ projects/clang400-import/contrib/llvm/lib/CodeGen/RegisterCoalescer.cpp (revision 313643)
	@@ -1,3186 +1,3206 @@
	//===- RegisterCoalescer.cpp - Generic Register Coalescing Interface -------==//
	//
	// The LLVM Compiler Infrastructure
	//
	// This file is distributed under the University of Illinois Open Source
	// License. See LICENSE.TXT for details.
	//
	//===----------------------------------------------------------------------===//
	//
	// This file implements the generic RegisterCoalescer interface which
	// is used as the common interface used by all clients and
	// implementations of register coalescing.
	//
	//===----------------------------------------------------------------------===//

	#include "RegisterCoalescer.h"
	#include "llvm/ADT/STLExtras.h"
	#include "llvm/ADT/SmallSet.h"
	#include "llvm/ADT/Statistic.h"
	#include "llvm/Analysis/AliasAnalysis.h"
	#include "llvm/CodeGen/LiveIntervalAnalysis.h"
	#include "llvm/CodeGen/LiveRangeEdit.h"
	#include "llvm/CodeGen/MachineFrameInfo.h"
	#include "llvm/CodeGen/MachineInstr.h"
	#include "llvm/CodeGen/MachineLoopInfo.h"
	#include "llvm/CodeGen/MachineRegisterInfo.h"
	#include "llvm/CodeGen/Passes.h"
	#include "llvm/CodeGen/RegisterClassInfo.h"
	#include "llvm/CodeGen/VirtRegMap.h"
	#include "llvm/IR/Value.h"
	#include "llvm/Pass.h"
	#include "llvm/Support/CommandLine.h"
	#include "llvm/Support/Debug.h"
	#include "llvm/Support/ErrorHandling.h"
	#include "llvm/Support/raw_ostream.h"
	#include "llvm/Target/TargetInstrInfo.h"
	#include "llvm/Target/TargetMachine.h"
	#include "llvm/Target/TargetRegisterInfo.h"
	#include "llvm/Target/TargetSubtargetInfo.h"
	#include <algorithm>
	#include <cmath>
	using namespace llvm;

	// Debug category used by the DEBUG() output and statistics in this file.
	#define DEBUG_TYPE "regalloc"

	// Counters reported by this pass; each one pairs a variable with the
	// human-readable description given as the second argument.
	STATISTIC(numJoins , "Number of interval joins performed");
	STATISTIC(numCrossRCs , "Number of cross class joins performed");
	STATISTIC(numCommutes , "Number of instruction commuting performed");
	STATISTIC(numExtends , "Number of copies extended");
	STATISTIC(NumReMats , "Number of instructions re-materialized");
	STATISTIC(NumInflated , "Number of register classes inflated");
	STATISTIC(NumLaneConflicts, "Number of dead lane conflicts tested");
	STATISTIC(NumLaneResolves, "Number of dead lane conflicts resolved");

	/// Command-line switch "join-liveintervals"; initialized to true.
	static cl::opt<bool>
	EnableJoining("join-liveintervals",
	cl::desc("Coalesce copies (default=true)"),
	cl::init(true));

	/// Hidden command-line switch "terminal-rule"; initialized to false.
	static cl::opt<bool> UseTerminalRule("terminal-rule",
	cl::desc("Apply the terminal rule"),
	cl::init(false), cl::Hidden);

	/// Temporary flag to test critical edge unsplitting.
	static cl::opt<bool>
	EnableJoinSplits("join-splitedges",
	cl::desc("Coalesce copies on split edges (default=subtarget)"), cl::Hidden);

	/// Temporary flag to test global copy optimization.
	/// Tri-state option: it starts as cl::BOU_UNSET.
	static cl::opt<cl::boolOrDefault>
	EnableGlobalCopies("join-globalcopies",
	cl::desc("Coalesce copies that span blocks (default=subtarget)"),
	cl::init(cl::BOU_UNSET), cl::Hidden);

	/// Hidden command-line switch "verify-coalescing".
	static cl::opt<bool>
	VerifyCoalescing("verify-coalescing",
	cl::desc("Verify machine instrs before and after register coalescing"),
	cl::Hidden);

	namespace {
	class RegisterCoalescer : public MachineFunctionPass,
	private LiveRangeEdit::Delegate {
	MachineFunction* MF;
	MachineRegisterInfo* MRI;
	const TargetMachine* TM;
	const TargetRegisterInfo* TRI;
	const TargetInstrInfo* TII;
	LiveIntervals *LIS;
	const MachineLoopInfo* Loops;
	AliasAnalysis *AA;
	RegisterClassInfo RegClassInfo;

	/// A LaneMask to remember on which subregister live ranges we need to call
	/// shrinkToUses() later.
	LaneBitmask ShrinkMask;

	/// True if the main range of the currently coalesced intervals should be
	/// checked for smaller live intervals.
	bool ShrinkMainRange;

	/// \brief True if the coalescer should aggressively coalesce global copies
	/// in favor of keeping local copies.
	bool JoinGlobalCopies;

	/// \brief True if the coalescer should aggressively coalesce fall-thru
	/// blocks exclusively containing copies.
	bool JoinSplitEdges;

	/// Copy instructions yet to be coalesced.
	SmallVector<MachineInstr*, 8> WorkList;
	SmallVector<MachineInstr*, 8> LocalWorkList;

	/// Set of instruction pointers that have been erased, and
	/// that may be present in WorkList.
	SmallPtrSet<MachineInstr*, 8> ErasedInstrs;

	/// Dead instructions that are about to be deleted.
	SmallVector<MachineInstr*, 8> DeadDefs;

	/// Virtual registers to be considered for register class inflation.
	SmallVector<unsigned, 8> InflateRegs;

	/// Recursively eliminate dead defs in DeadDefs.
	void eliminateDeadDefs();

	/// LiveRangeEdit callback for eliminateDeadDefs().
	void LRE_WillEraseInstruction(MachineInstr *MI) override;

	/// Coalesce the LocalWorkList.
	void coalesceLocals();

	/// Join compatible live intervals
	void joinAllIntervals();

	/// Coalesce copies in the specified MBB, putting
	/// copies that cannot yet be coalesced into WorkList.
	void copyCoalesceInMBB(MachineBasicBlock *MBB);

	/// Tries to coalesce all copies in CurrList. Returns true if any progress
	/// was made.
	bool copyCoalesceWorkList(MutableArrayRef<MachineInstr*> CurrList);

	/// Attempt to join intervals corresponding to SrcReg/DstReg, which are the
	/// src/dst of the copy instruction CopyMI. This returns true if the copy
	/// was successfully coalesced away. If it is not currently possible to
	/// coalesce this interval, but it may be possible if other things get
	/// coalesced, then it returns true by reference in 'Again'.
	bool joinCopy(MachineInstr *TheCopy, bool &Again);

	/// Attempt to join these two intervals. On failure, this
	/// returns false. The output "SrcInt" will not have been modified, so we
	/// can use this information below to update aliases.
	bool joinIntervals(CoalescerPair &CP);

	/// Attempt joining two virtual registers. Return true on success.
	bool joinVirtRegs(CoalescerPair &CP);

	/// Attempt joining with a reserved physreg.
	bool joinReservedPhysReg(CoalescerPair &CP);

	/// Add the LiveRange @p ToMerge as a subregister liverange of @p LI.
	/// Subranges in @p LI which only partially interfere with the desired
	/// LaneMask are split as necessary. @p LaneMask are the lanes that
	/// @p ToMerge will occupy in the coalescer register. @p LI has its subrange
	/// lanemasks already adjusted to the coalesced register.
	void mergeSubRangeInto(LiveInterval &LI, const LiveRange &ToMerge,
	LaneBitmask LaneMask, CoalescerPair &CP);

	/// Join the liveranges of two subregisters. Joins @p RRange into
	/// @p LRange, @p RRange may be invalid afterwards.
	void joinSubRegRanges(LiveRange &LRange, LiveRange &RRange,
	LaneBitmask LaneMask, const CoalescerPair &CP);

	/// We found a non-trivially-coalescable copy. If the source value number is
	/// defined by a copy from the destination reg see if we can merge these two
	/// destination reg valno# into a single value number, eliminating a copy.
	/// This returns true if an interval was modified.
	bool adjustCopiesBackFrom(const CoalescerPair &CP, MachineInstr *CopyMI);

	/// Return true if there are definitions of IntB
	/// other than BValNo val# that can reach uses of AValno val# of IntA.
	bool hasOtherReachingDefs(LiveInterval &IntA, LiveInterval &IntB,
	VNInfo AValNo, VNInfo BValNo);

	/// We found a non-trivially-coalescable copy.
	/// If the source value number is defined by a commutable instruction and
	/// its other operand is coalesced to the copy dest register, see if we
	/// can transform the copy into a noop by commuting the definition.
	/// This returns true if an interval was modified.
	bool removeCopyByCommutingDef(const CoalescerPair &CP,MachineInstr *CopyMI);

	/// If the source of a copy is defined by a
	/// trivial computation, replace the copy by rematerialize the definition.
	bool reMaterializeTrivialDef(const CoalescerPair &CP, MachineInstr *CopyMI,
	bool &IsDefCopy);

	/// Return true if a copy involving a physreg should be joined.
	bool canJoinPhys(const CoalescerPair &CP);

	/// Replace all defs and uses of SrcReg to DstReg and update the subregister
	/// number if it is not zero. If DstReg is a physical register and the
	/// existing subregister number of the def / use being updated is not zero,
	/// make sure to set it to the correct physical subregister.
	void updateRegDefsUses(unsigned SrcReg, unsigned DstReg, unsigned SubIdx);

	/// If the given machine operand reads only undefined lanes add an undef
	/// flag.
	/// This can happen when undef uses were previously concealed by a copy
	/// which we coalesced. Example:
	/// %vreg0:sub0<def,read-undef> = ...
	/// %vreg1 = COPY %vreg0 <-- Coalescing COPY reveals undef
	/// = use %vreg1:sub1 <-- hidden undef use
	void addUndefFlag(const LiveInterval &Int, SlotIndex UseIdx,
	MachineOperand &MO, unsigned SubRegIdx);

	/// Handle copies of undef values.
	/// Returns true if @p CopyMI was a copy of an undef value and eliminated.
	bool eliminateUndefCopy(MachineInstr *CopyMI);

	/// Check whether or not we should apply the terminal rule on the
	/// destination (Dst) of \p Copy.
	/// When the terminal rule applies, Copy is not profitable to
	/// coalesce.
	/// Dst is terminal if it has exactly one affinity (Dst, Src) and
	/// at least one interference (Dst, Dst2). If Dst is terminal, the
	/// terminal rule consists in checking that at least one of
	/// interfering node, say Dst2, has an affinity of equal or greater
	/// weight with Src.
	/// In that case, Dst2 and Dst will not be able to be both coalesced
	/// with Src. Since Dst2 exposes more coalescing opportunities than
	/// Dst, we can drop \p Copy.
	bool applyTerminalRule(const MachineInstr &Copy) const;

	/// Wrapper method for \see LiveIntervals::shrinkToUses.
	/// This method does the proper fixing of the live-ranges when the afore
	/// mentioned method returns true.
	void shrinkToUses(LiveInterval *LI,
	SmallVectorImpl<MachineInstr * > *Dead = nullptr) {
	if (LIS->shrinkToUses(LI, Dead)) {
	/// Check whether or not \p LI is composed by multiple connected
	/// components and if that is the case, fix that.
	SmallVector<LiveInterval*, 8> SplitLIs;
	LIS->splitSeparateComponents(*LI, SplitLIs);
	}
	}

	public:
	static char ID; ///< Class identification, replacement for typeinfo
	RegisterCoalescer() : MachineFunctionPass(ID) {
	initializeRegisterCoalescerPass(*PassRegistry::getPassRegistry());
	}

	void getAnalysisUsage(AnalysisUsage &AU) const override;

	void releaseMemory() override;

	/// This is the pass entry point.
	bool runOnMachineFunction(MachineFunction&) override;

	/// Implement the dump method.
	void print(raw_ostream &O, const Module* = nullptr) const override;
	};
	} // end anonymous namespace

	// Public handle to the pass: a reference to RegisterCoalescer::ID.
	char &llvm::RegisterCoalescerID = RegisterCoalescer::ID;

	// Pass registration under the name "simple-register-coalescing", together
	// with the analyses listed as its dependencies.
	INITIALIZE_PASS_BEGIN(RegisterCoalescer, "simple-register-coalescing",
	"Simple Register Coalescing", false, false)
	INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
	INITIALIZE_PASS_DEPENDENCY(SlotIndexes)
	INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
	INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
	INITIALIZE_PASS_END(RegisterCoalescer, "simple-register-coalescing",
	"Simple Register Coalescing", false, false)

	// Storage for the pass identifier declared in the class.
	char RegisterCoalescer::ID = 0;

	static bool isMoveInstr(const TargetRegisterInfo &tri, const MachineInstr *MI,
	unsigned &Src, unsigned &Dst,
	unsigned &SrcSub, unsigned &DstSub) {
	if (MI->isCopy()) {
	Dst = MI->getOperand(0).getReg();
	DstSub = MI->getOperand(0).getSubReg();
	Src = MI->getOperand(1).getReg();
	SrcSub = MI->getOperand(1).getSubReg();
	} else if (MI->isSubregToReg()) {
	Dst = MI->getOperand(0).getReg();
	DstSub = tri.composeSubRegIndices(MI->getOperand(0).getSubReg(),
	MI->getOperand(3).getImm());
	Src = MI->getOperand(2).getReg();
	SrcSub = MI->getOperand(2).getSubReg();
	} else
	return false;
	return true;
	}

	/// Return true if this block should be vacated by the coalescer to eliminate
	/// branches. The important cases to handle in the coalescer are critical edges
	/// split during phi elimination which contain only copies. Simple blocks that
	/// contain non-branches should also be vacated, but this can be handled by an
	/// earlier pass similar to early if-conversion.
	///
	/// A block qualifies when it has exactly one predecessor and one successor
	/// and every instruction in it is copy-like or an unconditional branch.
	static bool isSplitEdge(const MachineBasicBlock *MBB) {
	  if (MBB->pred_size() != 1 || MBB->succ_size() != 1)
	    return false;

	  for (const auto &MI : *MBB) {
	    if (!MI.isCopyLike() && !MI.isUnconditionalBranch())
	      return false;
	  }
	  return true;
	}

	/// Initialize this pair from the copy-like instruction \p MI.
	///
	/// All fields are reset first. On success SrcReg/DstReg, the sub-register
	/// indices, NewRC and the Partial/Flipped/CrossClass flags describe how the
	/// two registers would be joined, and true is returned. False is returned
	/// when \p MI is not a move recognized by isMoveInstr(), when both registers
	/// are physical, or when no register (class) satisfies the combined
	/// constraints.
	bool CoalescerPair::setRegisters(const MachineInstr *MI) {
	  SrcReg = DstReg = 0;
	  SrcIdx = DstIdx = 0;
	  NewRC = nullptr;
	  Flipped = CrossClass = false;

	  unsigned Src, Dst, SrcSub, DstSub;
	  if (!isMoveInstr(TRI, MI, Src, Dst, SrcSub, DstSub))
	    return false;
	  Partial = SrcSub || DstSub;

	  // If one register is a physreg, it must be Dst.
	  if (TargetRegisterInfo::isPhysicalRegister(Src)) {
	    if (TargetRegisterInfo::isPhysicalRegister(Dst))
	      return false;
	    std::swap(Src, Dst);
	    std::swap(SrcSub, DstSub);
	    Flipped = true;
	  }

	  const MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo();

	  if (TargetRegisterInfo::isPhysicalRegister(Dst)) {
	    // Eliminate DstSub on a physreg.
	    if (DstSub) {
	      Dst = TRI.getSubReg(Dst, DstSub);
	      if (!Dst) return false;
	      DstSub = 0;
	    }

	    // Eliminate SrcSub by picking a corresponding Dst superregister.
	    if (SrcSub) {
	      Dst = TRI.getMatchingSuperReg(Dst, SrcSub, MRI.getRegClass(Src));
	      if (!Dst) return false;
	    } else if (!MRI.getRegClass(Src)->contains(Dst)) {
	      return false;
	    }
	  } else {
	    // Both registers are virtual.
	    const TargetRegisterClass *SrcRC = MRI.getRegClass(Src);
	    const TargetRegisterClass *DstRC = MRI.getRegClass(Dst);

	    // Both registers have subreg indices.
	    if (SrcSub && DstSub) {
	      // Copies between different sub-registers are never coalescable.
	      if (Src == Dst && SrcSub != DstSub)
	        return false;

	      NewRC = TRI.getCommonSuperRegClass(SrcRC, SrcSub, DstRC, DstSub,
	                                         SrcIdx, DstIdx);
	      if (!NewRC)
	        return false;
	    } else if (DstSub) {
	      // SrcReg will be merged with a sub-register of DstReg.
	      SrcIdx = DstSub;
	      NewRC = TRI.getMatchingSuperRegClass(DstRC, SrcRC, DstSub);
	    } else if (SrcSub) {
	      // DstReg will be merged with a sub-register of SrcReg.
	      DstIdx = SrcSub;
	      NewRC = TRI.getMatchingSuperRegClass(SrcRC, DstRC, SrcSub);
	    } else {
	      // This is a straight copy without sub-registers.
	      NewRC = TRI.getCommonSubClass(DstRC, SrcRC);
	    }

	    // The combined constraint may be impossible to satisfy.
	    if (!NewRC)
	      return false;

	    // Prefer SrcReg to be a sub-register of DstReg.
	    // FIXME: Coalescer should support subregs symmetrically.
	    if (DstIdx && !SrcIdx) {
	      std::swap(Src, Dst);
	      std::swap(SrcIdx, DstIdx);
	      Flipped = !Flipped;
	    }

	    CrossClass = NewRC != DstRC || NewRC != SrcRC;
	  }
	  // Check our invariants
	  assert(TargetRegisterInfo::isVirtualRegister(Src) && "Src must be virtual");
	  assert(!(TargetRegisterInfo::isPhysicalRegister(Dst) && DstSub) &&
	         "Cannot have a physical SubIdx");
	  SrcReg = Src;
	  DstReg = Dst;
	  return true;
	}

	/// Exchange the source and destination roles of this pair.
	///
	/// Nothing is changed and false is returned when DstReg is a physical
	/// register; otherwise the registers and sub-register indices are swapped,
	/// Flipped is toggled and true is returned.
	bool CoalescerPair::flip() {
	  const bool CanFlip = !TargetRegisterInfo::isPhysicalRegister(DstReg);
	  if (CanFlip) {
	    std::swap(SrcReg, DstReg);
	    std::swap(SrcIdx, DstIdx);
	    Flipped = !Flipped;
	  }
	  return CanFlip;
	}

	/// Return true if \p MI is a move between the same registers (and matching
	/// sub-registers) that this pair describes.
	///
	/// A null \p MI, or one that isMoveInstr() does not recognize, is never
	/// coalescable.
	bool CoalescerPair::isCoalescable(const MachineInstr *MI) const {
	  unsigned MoveSrc, MoveDst, MoveSrcSub, MoveDstSub;
	  if (!MI || !isMoveInstr(TRI, MI, MoveSrc, MoveDst, MoveSrcSub, MoveDstSub))
	    return false;

	  // Orient the move so that its source side is the pair's SrcReg.
	  if (MoveDst == SrcReg) {
	    std::swap(MoveSrc, MoveDst);
	    std::swap(MoveSrcSub, MoveDstSub);
	  } else if (MoveSrc != SrcReg) {
	    return false;
	  }

	  if (!TargetRegisterInfo::isPhysicalRegister(DstReg)) {
	    // DstReg is virtual: the registers must be identical and the composed
	    // sub-register indices on both sides must agree.
	    if (MoveDst != DstReg)
	      return false;
	    return TRI.composeSubRegIndices(SrcIdx, MoveSrcSub) ==
	           TRI.composeSubRegIndices(DstIdx, MoveDstSub);
	  }

	  // DstReg is physical, so the other side of the move must be physical too.
	  if (!TargetRegisterInfo::isPhysicalRegister(MoveDst))
	    return false;
	  assert(!DstIdx && !SrcIdx && "Inconsistent CoalescerPair state.");

	  // DstSub could be set for a physreg from INSERT_SUBREG.
	  if (MoveDstSub)
	    MoveDst = TRI.getSubReg(MoveDst, MoveDstSub);

	  // A full copy must hit DstReg itself; a partial copy must hit the
	  // corresponding sub-register of DstReg.
	  if (MoveSrcSub)
	    return TRI.getSubReg(DstReg, MoveSrcSub) == MoveDst;
	  return DstReg == MoveDst;
	}

	/// Declare the analyses this pass requires and the ones it preserves, then
	/// defer to MachineFunctionPass for the remaining defaults.
	void RegisterCoalescer::getAnalysisUsage(AnalysisUsage &AU) const {
	AU.setPreservesCFG();
	// Required inputs: alias analysis, live intervals and loop info.
	AU.addRequired<AAResultsWrapperPass>();
	AU.addRequired<LiveIntervals>();
	AU.addPreserved<LiveIntervals>();
	AU.addPreserved<SlotIndexes>();
	AU.addRequired<MachineLoopInfo>();
	AU.addPreserved<MachineLoopInfo>();
	AU.addPreservedID(MachineDominatorsID);
	MachineFunctionPass::getAnalysisUsage(AU);
	}

	/// Eliminate the instructions queued in DeadDefs through a temporary
	/// LiveRangeEdit. This pass is handed over as the delegate, so
	/// LRE_WillEraseInstruction() is the callback for instructions being erased.
	void RegisterCoalescer::eliminateDeadDefs() {
	  SmallVector<unsigned, 8> NewRegs;
	  // MF and LIS are pointer members; LiveRangeEdit takes them by reference.
	  LiveRangeEdit(nullptr, NewRegs, *MF, *LIS,
	                nullptr, this).eliminateDeadDefs(DeadDefs);
	}

	/// LiveRangeEdit delegate hook: record \p MI as erased.
	void RegisterCoalescer::LRE_WillEraseInstruction(MachineInstr *MI) {
	  // WorkList may still hold a pointer to MI; remembering it in ErasedInstrs
	  // lets later processing avoid visiting the erased instruction.
	  ErasedInstrs.insert(MI);
	}

	bool RegisterCoalescer::adjustCopiesBackFrom(const CoalescerPair &CP,
	MachineInstr *CopyMI) {
	assert(!CP.isPartial() && "This doesn't work for partial copies.");
	assert(!CP.isPhys() && "This doesn't work for physreg copies.");

	LiveInterval &IntA =
	LIS->getInterval(CP.isFlipped() ? CP.getDstReg() : CP.getSrcReg());
	LiveInterval &IntB =
	LIS->getInterval(CP.isFlipped() ? CP.getSrcReg() : CP.getDstReg());
	SlotIndex CopyIdx = LIS->getInstructionIndex(*CopyMI).getRegSlot();

	// We have a non-trivially-coalescable copy with IntA being the source and
	// IntB being the dest, thus this defines a value number in IntB. If the
	// source value number (in IntA) is defined by a copy from B, see if we can
	// merge these two pieces of B into a single value number, eliminating a copy.
	// For example:
	//
	// A3 = B0
	// ...
	// B1 = A3 <- this copy
	//
	// In this case, B0 can be extended to where the B1 copy lives, allowing the
	// B1 value number to be replaced with B0 (which simplifies the B
	// liveinterval).

	// BValNo is a value number in B that is defined by a copy from A. 'B1' in
	// the example above.
	LiveInterval::iterator BS = IntB.FindSegmentContaining(CopyIdx);
	if (BS == IntB.end()) return false;
	VNInfo *BValNo = BS->valno;

	// Get the location that B is defined at. Two options: either this value has
	// an unknown definition point or it is defined at CopyIdx. If unknown, we
	// can't process it.
	if (BValNo->def != CopyIdx) return false;

	// AValNo is the value number in A that defines the copy, A3 in the example.
	SlotIndex CopyUseIdx = CopyIdx.getRegSlot(true);
	LiveInterval::iterator AS = IntA.FindSegmentContaining(CopyUseIdx);
	// The live segment might not exist after fun with physreg coalescing.
	if (AS == IntA.end()) return false;
	VNInfo *AValNo = AS->valno;

	// If AValNo is defined as a copy from IntB, we can potentially process this.
	// Get the instruction that defines this value number.
	MachineInstr *ACopyMI = LIS->getInstructionFromIndex(AValNo->def);
	// Don't allow any partial copies, even if isCoalescable() allows them.
	if (!CP.isCoalescable(ACopyMI) \|\| !ACopyMI->isFullCopy())
	return false;

	// Get the Segment in IntB that this value number starts with.
	LiveInterval::iterator ValS =
	IntB.FindSegmentContaining(AValNo->def.getPrevSlot());
	if (ValS == IntB.end())
	return false;

	// Make sure that the end of the live segment is inside the same block as
	// CopyMI.
	MachineInstr *ValSEndInst =
	LIS->getInstructionFromIndex(ValS->end.getPrevSlot());
	if (!ValSEndInst \|\| ValSEndInst->getParent() != CopyMI->getParent())
	return false;

	// Okay, we now know that ValS ends in the same block that the CopyMI
	// live-range starts. If there are no intervening live segments between them
	// in IntB, we can merge them.
	if (ValS+1 != BS) return false;

	DEBUG(dbgs() << "Extending: " << PrintReg(IntB.reg, TRI));

	SlotIndex FillerStart = ValS->end, FillerEnd = BS->start;
	// We are about to delete CopyMI, so need to remove it as the 'instruction
	// that defines this value #'. Update the valnum with the new defining
	// instruction #.
	BValNo->def = FillerStart;

	// Okay, we can merge them. We need to insert a new liverange:
	// [ValS.end, BS.begin) of either value number, then we merge the
	// two value numbers.
	IntB.addSegment(LiveInterval::Segment(FillerStart, FillerEnd, BValNo));

	// Okay, merge "B1" into the same value number as "B0".
	if (BValNo != ValS->valno)
	IntB.MergeValueNumberInto(BValNo, ValS->valno);

	// Do the same for the subregister segments.
	for (LiveInterval::SubRange &S : IntB.subranges()) {
	VNInfo *SubBValNo = S.getVNInfoAt(CopyIdx);
	S.addSegment(LiveInterval::Segment(FillerStart, FillerEnd, SubBValNo));
	VNInfo *SubValSNo = S.getVNInfoAt(AValNo->def.getPrevSlot());
	if (SubBValNo != SubValSNo)
	S.MergeValueNumberInto(SubBValNo, SubValSNo);
	}

	DEBUG(dbgs() << " result = " << IntB << '\n');

	// If the source instruction was killing the source register before the
	// merge, unset the isKill marker given the live range has been extended.
	int UIdx = ValSEndInst->findRegisterUseOperandIdx(IntB.reg, true);
	if (UIdx != -1) {
	ValSEndInst->getOperand(UIdx).setIsKill(false);
	}

	// Rewrite the copy. If the copy instruction was killing the destination
	// register before the merge, find the last use and trim the live range. That
	// will also add the isKill marker.
	CopyMI->substituteRegister(IntA.reg, IntB.reg, 0, *TRI);
	if (AS->end == CopyIdx)
	shrinkToUses(&IntA);

	++numExtends;
	return true;
	}

	/// Return true if some value of \p IntB other than \p BValNo is live over
	/// part of a segment of \p IntA that carries \p AValNo.
	///
	/// A PHI kill of \p AValNo is treated conservatively as "yes".
	bool RegisterCoalescer::hasOtherReachingDefs(LiveInterval &IntA,
	                                             LiveInterval &IntB,
	                                             VNInfo *AValNo,
	                                             VNInfo *BValNo) {
	  // If AValNo has PHI kills, conservatively assume that IntB defs can reach
	  // the PHI values.
	  if (LIS->hasPHIKill(IntA, AValNo))
	    return true;

	  for (LiveRange::Segment &SegA : IntA.segments) {
	    if (SegA.valno != AValNo)
	      continue;

	    // Begin at the IntB segment just before the first one that upper_bound
	    // places after SegA.start (or at the very first segment).
	    LiveInterval::iterator SegB =
	      std::upper_bound(IntB.begin(), IntB.end(), SegA.start);
	    if (SegB != IntB.begin())
	      --SegB;

	    while (SegB != IntB.end() && SegA.end >= SegB->start) {
	      if (SegB->valno != BValNo) {
	        // The foreign segment either covers SegA's start or begins strictly
	        // inside SegA.
	        const bool CoversStart =
	          SegB->start <= SegA.start && SegB->end > SegA.start;
	        const bool StartsInside =
	          SegB->start > SegA.start && SegB->start < SegA.end;
	        if (CoversStart || StartsInside)
	          return true;
	      }
	      ++SegB;
	    }
	  }
	  return false;
	}

	/// Copy every segment of live range @p Src whose value number is
	/// @p SrcValNo into live range @p Dst, tagging each copied segment with
	/// value number @p DstValNo.
	static void addSegmentsWithValNo(LiveRange &Dst, VNInfo *DstValNo,
	                                 const LiveRange &Src, const VNInfo *SrcValNo)
	{
	  for (const LiveRange::Segment &Seg : Src.segments) {
	    if (Seg.valno == SrcValNo)
	      Dst.addSegment(LiveRange::Segment(Seg.start, Seg.end, DstValNo));
	  }
	}

	bool RegisterCoalescer::removeCopyByCommutingDef(const CoalescerPair &CP,
	MachineInstr *CopyMI) {
	assert(!CP.isPhys());

	LiveInterval &IntA =
	LIS->getInterval(CP.isFlipped() ? CP.getDstReg() : CP.getSrcReg());
	LiveInterval &IntB =
	LIS->getInterval(CP.isFlipped() ? CP.getSrcReg() : CP.getDstReg());

	// We found a non-trivially-coalescable copy with IntA being the source and
	// IntB being the dest, thus this defines a value number in IntB. If the
	// source value number (in IntA) is defined by a commutable instruction and
	// its other operand is coalesced to the copy dest register, see if we can
	// transform the copy into a noop by commuting the definition. For example,
	//
	// A3 = op A2 B0<kill>
	// ...
	// B1 = A3 <- this copy
	// ...
	// = op A3 <- more uses
	//
	// ==>
	//
	// B2 = op B0 A2<kill>
	// ...
	// B1 = B2 <- now an identity copy
	// ...
	// = op B2 <- more uses

	// BValNo is a value number in B that is defined by a copy from A. 'B1' in
	// the example above.
	SlotIndex CopyIdx = LIS->getInstructionIndex(*CopyMI).getRegSlot();
	VNInfo *BValNo = IntB.getVNInfoAt(CopyIdx);
	assert(BValNo != nullptr && BValNo->def == CopyIdx);

	// AValNo is the value number in A that defines the copy, A3 in the example.
	VNInfo *AValNo = IntA.getVNInfoAt(CopyIdx.getRegSlot(true));
	assert(AValNo && !AValNo->isUnused() && "COPY source not live");
	if (AValNo->isPHIDef())
	return false;
	MachineInstr *DefMI = LIS->getInstructionFromIndex(AValNo->def);
	if (!DefMI)
	return false;
	if (!DefMI->isCommutable())
	return false;
	// If DefMI is a two-address instruction then commuting it will change the
	// destination register.
	int DefIdx = DefMI->findRegisterDefOperandIdx(IntA.reg);
	assert(DefIdx != -1);
	unsigned UseOpIdx;
	if (!DefMI->isRegTiedToUseOperand(DefIdx, &UseOpIdx))
	return false;

	// FIXME: The code below tries to commute 'UseOpIdx' operand with some other
	// commutable operand which is expressed by 'CommuteAnyOperandIndex'value
	// passed to the method. That _other_ operand is chosen by
	// the findCommutedOpIndices() method.
	//
	// That is obviously an area for improvement in case of instructions having
	// more than 2 operands. For example, if some instruction has 3 commutable
	// operands then all possible variants (i.e. op#1<->op#2, op#1<->op#3,
	// op#2<->op#3) of commute transformation should be considered/tried here.
	unsigned NewDstIdx = TargetInstrInfo::CommuteAnyOperandIndex;
	if (!TII->findCommutedOpIndices(*DefMI, UseOpIdx, NewDstIdx))
	return false;

	MachineOperand &NewDstMO = DefMI->getOperand(NewDstIdx);
	unsigned NewReg = NewDstMO.getReg();
	if (NewReg != IntB.reg \|\| !IntB.Query(AValNo->def).isKill())
	return false;

	// Make sure there are no other definitions of IntB that would reach the
	// uses which the new definition can reach.
	if (hasOtherReachingDefs(IntA, IntB, AValNo, BValNo))
	return false;

	// If some of the uses of IntA.reg is already coalesced away, return false.
	// It's not possible to determine whether it's safe to perform the coalescing.
	for (MachineOperand &MO : MRI->use_nodbg_operands(IntA.reg)) {
	MachineInstr *UseMI = MO.getParent();
	unsigned OpNo = &MO - &UseMI->getOperand(0);
	SlotIndex UseIdx = LIS->getInstructionIndex(*UseMI);
	LiveInterval::iterator US = IntA.FindSegmentContaining(UseIdx);
	if (US == IntA.end() \|\| US->valno != AValNo)
	continue;
	// If this use is tied to a def, we can't rewrite the register.
	if (UseMI->isRegTiedToDefOperand(OpNo))
	return false;
	}

	DEBUG(dbgs() << "\tremoveCopyByCommutingDef: " << AValNo->def << '\t'
	<< *DefMI);

	// At this point we have decided that it is legal to do this
	// transformation. Start by commuting the instruction.
	MachineBasicBlock *MBB = DefMI->getParent();
	MachineInstr *NewMI =
	TII->commuteInstruction(*DefMI, false, UseOpIdx, NewDstIdx);
	if (!NewMI)
	return false;
	if (TargetRegisterInfo::isVirtualRegister(IntA.reg) &&
	TargetRegisterInfo::isVirtualRegister(IntB.reg) &&
	!MRI->constrainRegClass(IntB.reg, MRI->getRegClass(IntA.reg)))
	return false;
	if (NewMI != DefMI) {
	LIS->ReplaceMachineInstrInMaps(DefMI, NewMI);
	MachineBasicBlock::iterator Pos = DefMI;
	MBB->insert(Pos, NewMI);
	MBB->erase(DefMI);
	}

	// If ALR and BLR overlaps and end of BLR extends beyond end of ALR, e.g.
	// A = or A, B
	// ...
	// B = A
	// ...
	// C = A<kill>
	// ...
	// = B

	// Update uses of IntA of the specific Val# with IntB.
	for (MachineRegisterInfo::use_iterator UI = MRI->use_begin(IntA.reg),
	UE = MRI->use_end();
	UI != UE; /* ++UI is below because of possible MI removal */) {
	MachineOperand &UseMO = *UI;
	++UI;
	if (UseMO.isUndef())
	continue;
	MachineInstr *UseMI = UseMO.getParent();
	if (UseMI->isDebugValue()) {
	// FIXME These don't have an instruction index. Not clear we have enough
	// info to decide whether to do this replacement or not. For now do it.
	UseMO.setReg(NewReg);
	continue;
	}
	SlotIndex UseIdx = LIS->getInstructionIndex(*UseMI).getRegSlot(true);
	LiveInterval::iterator US = IntA.FindSegmentContaining(UseIdx);
	assert(US != IntA.end() && "Use must be live");
	if (US->valno != AValNo)
	continue;
	// Kill flags are no longer accurate. They are recomputed after RA.
	UseMO.setIsKill(false);
	if (TargetRegisterInfo::isPhysicalRegister(NewReg))
	UseMO.substPhysReg(NewReg, *TRI);
	else
	UseMO.setReg(NewReg);
	if (UseMI == CopyMI)
	continue;
	if (!UseMI->isCopy())
	continue;
	if (UseMI->getOperand(0).getReg() != IntB.reg \|\|
	UseMI->getOperand(0).getSubReg())
	continue;

	// This copy will become a noop. If it's defining a new val#, merge it into
	// BValNo.
	SlotIndex DefIdx = UseIdx.getRegSlot();
	VNInfo *DVNI = IntB.getVNInfoAt(DefIdx);
	if (!DVNI)
	continue;
	DEBUG(dbgs() << "\t\tnoop: " << DefIdx << '\t' << *UseMI);
	assert(DVNI->def == DefIdx);
	BValNo = IntB.MergeValueNumberInto(DVNI, BValNo);
	for (LiveInterval::SubRange &S : IntB.subranges()) {
	VNInfo *SubDVNI = S.getVNInfoAt(DefIdx);
	if (!SubDVNI)
	continue;
	VNInfo *SubBValNo = S.getVNInfoAt(CopyIdx);
	assert(SubBValNo->def == CopyIdx);
	S.MergeValueNumberInto(SubDVNI, SubBValNo);
	}

	ErasedInstrs.insert(UseMI);
	LIS->RemoveMachineInstrFromMaps(*UseMI);
	UseMI->eraseFromParent();
	}

	// Extend BValNo by merging in IntA live segments of AValNo. Val# definition
	// is updated.
	BumpPtrAllocator &Allocator = LIS->getVNInfoAllocator();
	if (IntB.hasSubRanges()) {
	if (!IntA.hasSubRanges()) {
	LaneBitmask Mask = MRI->getMaxLaneMaskForVReg(IntA.reg);
	IntA.createSubRangeFrom(Allocator, Mask, IntA);
	}
	SlotIndex AIdx = CopyIdx.getRegSlot(true);
	for (LiveInterval::SubRange &SA : IntA.subranges()) {
	VNInfo *ASubValNo = SA.getVNInfoAt(AIdx);
	assert(ASubValNo != nullptr);

	LaneBitmask AMask = SA.LaneMask;
	for (LiveInterval::SubRange &SB : IntB.subranges()) {
	LaneBitmask BMask = SB.LaneMask;
	LaneBitmask Common = BMask & AMask;
	if (Common.none())
	continue;

	DEBUG( dbgs() << "\t\tCopy_Merge " << PrintLaneMask(BMask)
	<< " into " << PrintLaneMask(Common) << '\n');
	LaneBitmask BRest = BMask & ~AMask;
	LiveInterval::SubRange *CommonRange;
	if (BRest.any()) {
	SB.LaneMask = BRest;
	DEBUG(dbgs() << "\t\tReduce Lane to " << PrintLaneMask(BRest)
	<< '\n');
	// Duplicate SubRange for newly merged common stuff.
	CommonRange = IntB.createSubRangeFrom(Allocator, Common, SB);
	} else {
	// We van reuse the L SubRange.
	SB.LaneMask = Common;
	CommonRange = &SB;
	}
	LiveRange RangeCopy(SB, Allocator);

	VNInfo *BSubValNo = CommonRange->getVNInfoAt(CopyIdx);
	assert(BSubValNo->def == CopyIdx);
	BSubValNo->def = ASubValNo->def;
	addSegmentsWithValNo(*CommonRange, BSubValNo, SA, ASubValNo);
	AMask &= ~BMask;
	}
	if (AMask.any()) {
	DEBUG(dbgs() << "\t\tNew Lane " << PrintLaneMask(AMask) << '\n');
	LiveRange *NewRange = IntB.createSubRange(Allocator, AMask);
	VNInfo *BSubValNo = NewRange->getNextValue(CopyIdx, Allocator);
	addSegmentsWithValNo(*NewRange, BSubValNo, SA, ASubValNo);
	}
	}
	}

	BValNo->def = AValNo->def;
	addSegmentsWithValNo(IntB, BValNo, IntA, AValNo);
	DEBUG(dbgs() << "\t\textended: " << IntB << '\n');

	LIS->removeVRegDefAt(IntA, AValNo->def);

	DEBUG(dbgs() << "\t\ttrimmed: " << IntA << '\n');
	++numCommutes;
	return true;
	}

	/// Returns true if @p MI defines the full vreg @p Reg, as opposed to just
	/// defining a subregister.
	///
	/// A subregister def that carries the <undef> flag also counts as a full
	/// definition, because the value of the remaining lanes is irrelevant.
	/// @p Reg must not be a physical register (asserted).
	static bool definesFullReg(const MachineInstr &MI, unsigned Reg) {
	  assert(!TargetRegisterInfo::isPhysicalRegister(Reg) &&
	         "This code cannot handle physreg aliasing");
	  for (const MachineOperand &Op : MI.operands()) {
	    // Only register definitions of Reg are interesting.
	    if (!Op.isReg() || !Op.isDef() || Op.getReg() != Reg)
	      continue;
	    // Return true if we define the full register or don't care about the value
	    // inside other subregisters.
	    if (Op.getSubReg() == 0 || Op.isUndef())
	      return true;
	  }
	  return false;
	}

	bool RegisterCoalescer::reMaterializeTrivialDef(const CoalescerPair &CP,
	MachineInstr *CopyMI,
	bool &IsDefCopy) {
	IsDefCopy = false;
	unsigned SrcReg = CP.isFlipped() ? CP.getDstReg() : CP.getSrcReg();
	unsigned SrcIdx = CP.isFlipped() ? CP.getDstIdx() : CP.getSrcIdx();
	unsigned DstReg = CP.isFlipped() ? CP.getSrcReg() : CP.getDstReg();
	unsigned DstIdx = CP.isFlipped() ? CP.getSrcIdx() : CP.getDstIdx();
	if (TargetRegisterInfo::isPhysicalRegister(SrcReg))
	return false;

	LiveInterval &SrcInt = LIS->getInterval(SrcReg);
	SlotIndex CopyIdx = LIS->getInstructionIndex(*CopyMI);
	VNInfo *ValNo = SrcInt.Query(CopyIdx).valueIn();
	assert(ValNo && "CopyMI input register not live");
	if (ValNo->isPHIDef() \|\| ValNo->isUnused())
	return false;
	MachineInstr *DefMI = LIS->getInstructionFromIndex(ValNo->def);
	if (!DefMI)
	return false;
	if (DefMI->isCopyLike()) {
	IsDefCopy = true;
	return false;
	}
	if (!TII->isAsCheapAsAMove(*DefMI))
	return false;
	if (!TII->isTriviallyReMaterializable(*DefMI, AA))
	return false;
	if (!definesFullReg(*DefMI, SrcReg))
	return false;
	bool SawStore = false;
	if (!DefMI->isSafeToMove(AA, SawStore))
	return false;
	const MCInstrDesc &MCID = DefMI->getDesc();
	if (MCID.getNumDefs() != 1)
	return false;
	// Only support subregister destinations when the def is read-undef.
	MachineOperand &DstOperand = CopyMI->getOperand(0);
	unsigned CopyDstReg = DstOperand.getReg();
	if (DstOperand.getSubReg() && !DstOperand.isUndef())
	return false;

	// If both SrcIdx and DstIdx are set, correct rematerialization would widen
	// the register substantially (beyond both source and dest size). This is bad
	// for performance since it can cascade through a function, introducing many
	// extra spills and fills (e.g. ARM can easily end up copying QQQQPR registers
	// around after a few subreg copies).
	if (SrcIdx && DstIdx)
	return false;

	const TargetRegisterClass DefRC = TII->getRegClass(MCID, 0, TRI, MF);
	if (!DefMI->isImplicitDef()) {
	if (TargetRegisterInfo::isPhysicalRegister(DstReg)) {
	unsigned NewDstReg = DstReg;

	unsigned NewDstIdx = TRI->composeSubRegIndices(CP.getSrcIdx(),
	DefMI->getOperand(0).getSubReg());
	if (NewDstIdx)
	NewDstReg = TRI->getSubReg(DstReg, NewDstIdx);

	// Finally, make sure that the physical subregister that will be
	// constructed later is permitted for the instruction.
	if (!DefRC->contains(NewDstReg))
	return false;
	} else {
	// Theoretically, some stack frame reference could exist. Just make sure
	// it hasn't actually happened.
	assert(TargetRegisterInfo::isVirtualRegister(DstReg) &&
	"Only expect to deal with virtual or physical registers");
	}
	}

	DebugLoc DL = CopyMI->getDebugLoc();
	MachineBasicBlock *MBB = CopyMI->getParent();
	MachineBasicBlock::iterator MII =
	std::next(MachineBasicBlock::iterator(CopyMI));
	TII->reMaterialize(MBB, MII, DstReg, SrcIdx, DefMI, *TRI);
	MachineInstr &NewMI = *std::prev(MII);
	NewMI.setDebugLoc(DL);

	// In a situation like the following:
	// %vreg0:subreg = instr ; DefMI, subreg = DstIdx
	// %vreg1 = copy %vreg0:subreg ; CopyMI, SrcIdx = 0
	// instead of widening %vreg1 to the register class of %vreg0 simply do:
	// %vreg1 = instr
	const TargetRegisterClass *NewRC = CP.getNewRC();
	if (DstIdx != 0) {
	MachineOperand &DefMO = NewMI.getOperand(0);
	if (DefMO.getSubReg() == DstIdx) {
	assert(SrcIdx == 0 && CP.isFlipped()
	&& "Shouldn't have SrcIdx+DstIdx at this point");
	const TargetRegisterClass *DstRC = MRI->getRegClass(DstReg);
	const TargetRegisterClass *CommonRC =
	TRI->getCommonSubClass(DefRC, DstRC);
	if (CommonRC != nullptr) {
	NewRC = CommonRC;
	DstIdx = 0;
	DefMO.setSubReg(0);
	DefMO.setIsUndef(false); // Only subregs can have def+undef.
	}
	}
	}

	// CopyMI may have implicit operands, save them so that we can transfer them
	// over to the newly materialized instruction after CopyMI is removed.
	SmallVector<MachineOperand, 4> ImplicitOps;
	ImplicitOps.reserve(CopyMI->getNumOperands() -
	CopyMI->getDesc().getNumOperands());
	for (unsigned I = CopyMI->getDesc().getNumOperands(),
	E = CopyMI->getNumOperands();
	I != E; ++I) {
	MachineOperand &MO = CopyMI->getOperand(I);
	if (MO.isReg()) {
	assert(MO.isImplicit() && "No explicit operands after implict operands.");
	// Discard VReg implicit defs.
	if (TargetRegisterInfo::isPhysicalRegister(MO.getReg()))
	ImplicitOps.push_back(MO);
	}
	}

	LIS->ReplaceMachineInstrInMaps(*CopyMI, NewMI);
	CopyMI->eraseFromParent();
	ErasedInstrs.insert(CopyMI);

	// NewMI may have dead implicit defs (E.g. EFLAGS for MOV<bits>r0 on X86).
	// We need to remember these so we can add intervals once we insert
	// NewMI into SlotIndexes.
	SmallVector<unsigned, 4> NewMIImplDefs;
	for (unsigned i = NewMI.getDesc().getNumOperands(),
	e = NewMI.getNumOperands();
	i != e; ++i) {
	MachineOperand &MO = NewMI.getOperand(i);
	if (MO.isReg() && MO.isDef()) {
	assert(MO.isImplicit() && MO.isDead() &&
	TargetRegisterInfo::isPhysicalRegister(MO.getReg()));
	NewMIImplDefs.push_back(MO.getReg());
	}
	}

	if (TargetRegisterInfo::isVirtualRegister(DstReg)) {
	unsigned NewIdx = NewMI.getOperand(0).getSubReg();

	if (DefRC != nullptr) {
	if (NewIdx)
	NewRC = TRI->getMatchingSuperRegClass(NewRC, DefRC, NewIdx);
	else
	NewRC = TRI->getCommonSubClass(NewRC, DefRC);
	assert(NewRC && "subreg chosen for remat incompatible with instruction");
	}
	// Remap subranges to new lanemask and change register class.
	LiveInterval &DstInt = LIS->getInterval(DstReg);
	for (LiveInterval::SubRange &SR : DstInt.subranges()) {
	SR.LaneMask = TRI->composeSubRegIndexLaneMask(DstIdx, SR.LaneMask);
	}
	MRI->setRegClass(DstReg, NewRC);

	// Update machine operands and add flags.
	updateRegDefsUses(DstReg, DstReg, DstIdx);
	NewMI.getOperand(0).setSubReg(NewIdx);
	// Add dead subregister definitions if we are defining the whole register
	// but only part of it is live.
	// This could happen if the rematerialization instruction is rematerializing
	// more than actually is used in the register.
	// An example would be:
	// vreg1 = LOAD CONSTANTS 5, 8 ; Loading both 5 and 8 in different subregs
	// ; Copying only part of the register here, but the rest is undef.
	// vreg2:sub_16bit<def, read-undef> = COPY vreg1:sub_16bit
	// ==>
	// ; Materialize all the constants but only using one
	// vreg2 = LOAD_CONSTANTS 5, 8
	//
	// at this point for the part that wasn't defined before we could have
	// subranges missing the definition.
	if (NewIdx == 0 && DstInt.hasSubRanges()) {
	SlotIndex CurrIdx = LIS->getInstructionIndex(NewMI);
	SlotIndex DefIndex =
	CurrIdx.getRegSlot(NewMI.getOperand(0).isEarlyClobber());
	LaneBitmask MaxMask = MRI->getMaxLaneMaskForVReg(DstReg);
	VNInfo::Allocator& Alloc = LIS->getVNInfoAllocator();
	for (LiveInterval::SubRange &SR : DstInt.subranges()) {
	if (!SR.liveAt(DefIndex))
	SR.createDeadDef(DefIndex, Alloc);
	MaxMask &= ~SR.LaneMask;
	}
	if (MaxMask.any()) {
	LiveInterval::SubRange *SR = DstInt.createSubRange(Alloc, MaxMask);
	SR->createDeadDef(DefIndex, Alloc);
	}
	}
	} else if (NewMI.getOperand(0).getReg() != CopyDstReg) {
	// The New instruction may be defining a sub-register of what's actually
	// been asked for. If so it must implicitly define the whole thing.
	assert(TargetRegisterInfo::isPhysicalRegister(DstReg) &&
	"Only expect virtual or physical registers in remat");
	NewMI.getOperand(0).setIsDead(true);
	NewMI.addOperand(MachineOperand::CreateReg(
	CopyDstReg, true /IsDef/, true /IsImp/, false /IsKill/));
	// Record small dead def live-ranges for all the subregisters
	// of the destination register.
	// Otherwise, variables that live through may miss some
	// interferences, thus creating invalid allocation.
	// E.g., i386 code:
	// vreg1 = somedef ; vreg1 GR8
	// vreg2 = remat ; vreg2 GR32
	// CL = COPY vreg2.sub_8bit
	// = somedef vreg1 ; vreg1 GR8
	// =>
	// vreg1 = somedef ; vreg1 GR8
	// ECX<def, dead> = remat ; CL<imp-def>
	// = somedef vreg1 ; vreg1 GR8
	// vreg1 will see the inteferences with CL but not with CH since
	// no live-ranges would have been created for ECX.
	// Fix that!
	SlotIndex NewMIIdx = LIS->getInstructionIndex(NewMI);
	for (MCRegUnitIterator Units(NewMI.getOperand(0).getReg(), TRI);
	Units.isValid(); ++Units)
	if (LiveRange LR = LIS->getCachedRegUnit(Units))
	LR->createDeadDef(NewMIIdx.getRegSlot(), LIS->getVNInfoAllocator());
	}

	if (NewMI.getOperand(0).getSubReg())
	NewMI.getOperand(0).setIsUndef();

	// Transfer over implicit operands to the rematerialized instruction.
	for (MachineOperand &MO : ImplicitOps)
	NewMI.addOperand(MO);

	SlotIndex NewMIIdx = LIS->getInstructionIndex(NewMI);
	for (unsigned i = 0, e = NewMIImplDefs.size(); i != e; ++i) {
	unsigned Reg = NewMIImplDefs[i];
	for (MCRegUnitIterator Units(Reg, TRI); Units.isValid(); ++Units)
	if (LiveRange LR = LIS->getCachedRegUnit(Units))
	LR->createDeadDef(NewMIIdx.getRegSlot(), LIS->getVNInfoAllocator());
	}

	DEBUG(dbgs() << "Remat: " << NewMI);
	++NumReMats;

	// The source interval can become smaller because we removed a use.
	shrinkToUses(&SrcInt, &DeadDefs);
	if (!DeadDefs.empty()) {
	// If the virtual SrcReg is completely eliminated, update all DBG_VALUEs
	// to describe DstReg instead.
	for (MachineOperand &UseMO : MRI->use_operands(SrcReg)) {
	MachineInstr *UseMI = UseMO.getParent();
	if (UseMI->isDebugValue()) {
	UseMO.setReg(DstReg);
	DEBUG(dbgs() << "\t\tupdated: " << *UseMI);
	}
	}
	eliminateDeadDefs();
	}

	return true;
	}

	/// Handle a copy whose source value is undefined at the copy.
	///
	/// When the source register (or the copied lanes of it) is not live at
	/// @p CopyMI, the value defined by the copy is removed from the destination
	/// interval (or merged into the preceding value for a subregister def), uses
	/// that are no longer live are flagged <undef>, and the destination interval
	/// is shrunk. @p CopyMI itself is not erased here.
	///
	/// @return true if the copy was found to copy an undefined value and the
	///         liveness was updated; false if the source is live at the copy.
	bool RegisterCoalescer::eliminateUndefCopy(MachineInstr *CopyMI) {
	  // ProcessImplicitDefs may leave some copies of <undef> values, it only
	  // removes local variables. When we have a copy like:
	  //
	  // %vreg1 = COPY %vreg2<undef>
	  //
	  // We delete the copy and remove the corresponding value number from %vreg1.
	  // Any uses of that value number are marked as <undef>.

	  // Note that we do not query CoalescerPair here but redo isMoveInstr as the
	  // CoalescerPair may have a new register class with adjusted subreg indices
	  // at this point.
	  unsigned SrcReg, DstReg, SrcSubIdx, DstSubIdx;
	  isMoveInstr(*TRI, CopyMI, SrcReg, DstReg, SrcSubIdx, DstSubIdx);

	  SlotIndex Idx = LIS->getInstructionIndex(*CopyMI);
	  const LiveInterval &SrcLI = LIS->getInterval(SrcReg);
	  // CopyMI is undef iff SrcReg is not live before the instruction.
	  if (SrcSubIdx != 0 && SrcLI.hasSubRanges()) {
	    // Only the subranges overlapping the copied lanes matter.
	    LaneBitmask SrcMask = TRI->getSubRegIndexLaneMask(SrcSubIdx);
	    for (const LiveInterval::SubRange &SR : SrcLI.subranges()) {
	      if ((SR.LaneMask & SrcMask).none())
	        continue;
	      if (SR.liveAt(Idx))
	        return false;
	    }
	  } else if (SrcLI.liveAt(Idx))
	    return false;

	  DEBUG(dbgs() << "\tEliminating copy of <undef> value\n");

	  // Remove any DstReg segments starting at the instruction.
	  LiveInterval &DstLI = LIS->getInterval(DstReg);
	  SlotIndex RegIndex = Idx.getRegSlot();
	  // Remove value or merge with previous one in case of a subregister def.
	  if (VNInfo *PrevVNI = DstLI.getVNInfoAt(Idx)) {
	    VNInfo *VNI = DstLI.getVNInfoAt(RegIndex);
	    DstLI.MergeValueNumberInto(VNI, PrevVNI);

	    // The affected subregister segments can be removed.
	    LaneBitmask DstMask = TRI->getSubRegIndexLaneMask(DstSubIdx);
	    for (LiveInterval::SubRange &SR : DstLI.subranges()) {
	      if ((SR.LaneMask & DstMask).none())
	        continue;

	      VNInfo *SVNI = SR.getVNInfoAt(RegIndex);
	      assert(SVNI != nullptr && SlotIndex::isSameInstr(SVNI->def, RegIndex));
	      SR.removeValNo(SVNI);
	    }
	    DstLI.removeEmptySubRanges();
	  } else
	    LIS->removeVRegDefAt(DstLI, RegIndex);

	  // Mark uses as undef.
	  for (MachineOperand &MO : MRI->reg_nodbg_operands(DstReg)) {
	    if (MO.isDef() /*|| MO.isUndef()*/)
	      continue;
	    const MachineInstr &MI = *MO.getParent();
	    SlotIndex UseIdx = LIS->getInstructionIndex(MI);
	    LaneBitmask UseMask = TRI->getSubRegIndexLaneMask(MO.getSubReg());
	    bool isLive;
	    if (!UseMask.all() && DstLI.hasSubRanges()) {
	      // A partial use is live if any overlapping subrange is live here.
	      isLive = false;
	      for (const LiveInterval::SubRange &SR : DstLI.subranges()) {
	        if ((SR.LaneMask & UseMask).none())
	          continue;
	        if (SR.liveAt(UseIdx)) {
	          isLive = true;
	          break;
	        }
	      }
	    } else
	      isLive = DstLI.liveAt(UseIdx);
	    if (isLive)
	      continue;
	    MO.setIsUndef(true);
	    DEBUG(dbgs() << "\tnew undef: " << UseIdx << '\t' << MI);
	  }

	  // A def of a subregister may be a use of the other subregisters, so
	  // deleting a def of a subregister may also remove uses. Since CopyMI
	  // is still part of the function (but about to be erased), mark all
	  // defs of DstReg in it as <undef>, so that shrinkToUses would
	  // ignore them.
	  for (MachineOperand &MO : CopyMI->operands())
	    if (MO.isReg() && MO.isDef() && MO.getReg() == DstReg)
	      MO.setIsUndef(true);
	  LIS->shrinkToUses(&DstLI);

	  return true;
	}

	void RegisterCoalescer::addUndefFlag(const LiveInterval &Int, SlotIndex UseIdx,
	MachineOperand &MO, unsigned SubRegIdx) {
	LaneBitmask Mask = TRI->getSubRegIndexLaneMask(SubRegIdx);
	if (MO.isDef())
	Mask = ~Mask;
	bool IsUndef = true;
	for (const LiveInterval::SubRange &S : Int.subranges()) {
	if ((S.LaneMask & Mask).none())
	continue;
	if (S.liveAt(UseIdx)) {
	IsUndef = false;
	break;
	}
	}
	if (IsUndef) {
	MO.setIsUndef(true);
	// We found out some subregister use is actually reading an undefined
	// value. In some cases the whole vreg has become undefined at this
	// point so we have to potentially shrink the main range if the
	// use was ending a live segment there.
	LiveQueryResult Q = Int.Query(UseIdx);
	if (Q.valueOut() == nullptr)
	ShrinkMainRange = true;
	}
	}

	/// Rewrite every operand that mentions @p SrcReg so that it refers to
	/// @p DstReg instead, composing @p SubIdx into virtual-register operands and
	/// adjusting <undef> flags along the way.
	///
	/// @param SrcReg Register whose operands are rewritten.
	/// @param DstReg Replacement register (virtual or physical); may equal
	///               @p SrcReg when only a subregister index is being added.
	/// @param SubIdx Subregister index applied to the rewritten operands.
	void RegisterCoalescer::updateRegDefsUses(unsigned SrcReg,
	                                          unsigned DstReg,
	                                          unsigned SubIdx) {
	  bool DstIsPhys = TargetRegisterInfo::isPhysicalRegister(DstReg);
	  LiveInterval *DstInt = DstIsPhys ? nullptr : &LIS->getInterval(DstReg);

	  // Re-check the existing subregister operands of DstReg for lanes that are
	  // not live at the operand (skipped when SrcReg == DstReg).
	  if (DstInt && DstInt->hasSubRanges() && DstReg != SrcReg) {
	    for (MachineOperand &MO : MRI->reg_operands(DstReg)) {
	      unsigned SubReg = MO.getSubReg();
	      if (SubReg == 0 || MO.isUndef())
	        continue;
	      MachineInstr &MI = *MO.getParent();
	      if (MI.isDebugValue())
	        continue;
	      SlotIndex UseIdx = LIS->getInstructionIndex(MI).getRegSlot(true);
	      addUndefFlag(*DstInt, UseIdx, MO, SubReg);
	    }
	  }

	  SmallPtrSet<MachineInstr*, 8> Visited;
	  for (MachineRegisterInfo::reg_instr_iterator
	         I = MRI->reg_instr_begin(SrcReg), E = MRI->reg_instr_end();
	       I != E; ) {
	    // Advance before rewriting: the rewrite below changes the operands of
	    // UseMI.
	    MachineInstr *UseMI = &*(I++);

	    // Each instruction can only be rewritten once because sub-register
	    // composition is not always idempotent. When SrcReg != DstReg, rewriting
	    // the UseMI operands removes them from the SrcReg use-def chain, but when
	    // SrcReg is DstReg we could encounter UseMI twice if it has multiple
	    // operands mentioning the virtual register.
	    if (SrcReg == DstReg && !Visited.insert(UseMI).second)
	      continue;

	    SmallVector<unsigned,8> Ops;
	    bool Reads, Writes;
	    std::tie(Reads, Writes) = UseMI->readsWritesVirtualRegister(SrcReg, &Ops);

	    // If SrcReg wasn't read, it may still be the case that DstReg is live-in
	    // because SrcReg is a sub-register.
	    if (DstInt && !Reads && SubIdx)
	      Reads = DstInt->liveAt(LIS->getInstructionIndex(*UseMI));

	    // Replace SrcReg with DstReg in all UseMI operands.
	    for (unsigned i = 0, e = Ops.size(); i != e; ++i) {
	      MachineOperand &MO = UseMI->getOperand(Ops[i]);

	      // Adjust <undef> flags in case of sub-register joins. We don't want to
	      // turn a full def into a read-modify-write sub-register def and vice
	      // versa.
	      if (SubIdx && MO.isDef())
	        MO.setIsUndef(!Reads);

	      // A subreg use of a partially undef (super) register may be a complete
	      // undef use now and then has to be marked that way.
	      if (SubIdx != 0 && MO.isUse() && MRI->shouldTrackSubRegLiveness(DstReg)) {
	        if (!DstInt->hasSubRanges()) {
	          BumpPtrAllocator &Allocator = LIS->getVNInfoAllocator();
	          LaneBitmask Mask = MRI->getMaxLaneMaskForVReg(DstInt->reg);
	          DstInt->createSubRangeFrom(Allocator, Mask, *DstInt);
	        }
	        // For a DBG_VALUE the index just before the instruction is used.
	        SlotIndex MIIdx = UseMI->isDebugValue()
	                              ? LIS->getSlotIndexes()->getIndexBefore(*UseMI)
	                              : LIS->getInstructionIndex(*UseMI);
	        SlotIndex UseIdx = MIIdx.getRegSlot(true);
	        addUndefFlag(*DstInt, UseIdx, MO, SubIdx);
	      }

	      if (DstIsPhys)
	        MO.substPhysReg(DstReg, *TRI);
	      else
	        MO.substVirtReg(DstReg, SubIdx, *TRI);
	    }

	    DEBUG({
	        dbgs() << "\t\tupdated: ";
	        if (!UseMI->isDebugValue())
	          dbgs() << LIS->getInstructionIndex(*UseMI) << "\t";
	        dbgs() << *UseMI;
	      });
	  }
	}

	bool RegisterCoalescer::canJoinPhys(const CoalescerPair &CP) {
	// Always join simple intervals that are defined by a single copy from a
	// reserved register. This doesn't increase register pressure, so it is
	// always beneficial.
	if (!MRI->isReserved(CP.getDstReg())) {
	DEBUG(dbgs() << "\tCan only merge into reserved registers.\n");
	return false;
	}

	LiveInterval &JoinVInt = LIS->getInterval(CP.getSrcReg());
	if (JoinVInt.containsOneValue())
	return true;

	DEBUG(dbgs() << "\tCannot join complex intervals into reserved register.\n");
	return false;
	}

	/// Attempt to coalesce away the copy instruction @p CopyMI.
	///
	/// The copy may be removed as dead code, as a copy of an <undef> value, as an
	/// identity copy, by rematerializing its source, by one of the copy
	/// elimination special cases, or by joining the two live intervals.
	///
	/// @param CopyMI The copy instruction to process.
	/// @param Again  Set to true when coalescing failed now but may succeed
	///               later; false otherwise.
	/// @return true if the copy was coalesced or otherwise eliminated. Note that
	///         the undef-copy path erases @p CopyMI but still returns false.
	bool RegisterCoalescer::joinCopy(MachineInstr *CopyMI, bool &Again) {

	  Again = false;
	  DEBUG(dbgs() << LIS->getInstructionIndex(*CopyMI) << '\t' << *CopyMI);

	  CoalescerPair CP(*TRI);
	  if (!CP.setRegisters(CopyMI)) {
	    DEBUG(dbgs() << "\tNot coalescable.\n");
	    return false;
	  }

	  // Let the target veto the join; the classes and indices are swapped back
	  // when the pair is flipped.
	  if (CP.getNewRC()) {
	    auto SrcRC = MRI->getRegClass(CP.getSrcReg());
	    auto DstRC = MRI->getRegClass(CP.getDstReg());
	    unsigned SrcIdx = CP.getSrcIdx();
	    unsigned DstIdx = CP.getDstIdx();
	    if (CP.isFlipped()) {
	      std::swap(SrcIdx, DstIdx);
	      std::swap(SrcRC, DstRC);
	    }
	    if (!TRI->shouldCoalesce(CopyMI, SrcRC, SrcIdx, DstRC, DstIdx,
	                             CP.getNewRC())) {
	      DEBUG(dbgs() << "\tSubtarget bailed on coalescing.\n");
	      return false;
	    }
	  }

	  // Dead code elimination. This really should be handled by MachineDCE, but
	  // sometimes dead copies slip through, and we can't generate invalid live
	  // ranges.
	  if (!CP.isPhys() && CopyMI->allDefsAreDead()) {
	    DEBUG(dbgs() << "\tCopy is dead.\n");
	    DeadDefs.push_back(CopyMI);
	    eliminateDeadDefs();
	    return true;
	  }

	  // Eliminate undefs.
	  if (!CP.isPhys() && eliminateUndefCopy(CopyMI)) {
	    LIS->RemoveMachineInstrFromMaps(*CopyMI);
	    CopyMI->eraseFromParent();
	    return false; // Not coalescable.
	  }

	  // Coalesced copies are normally removed immediately, but transformations
	  // like removeCopyByCommutingDef() can inadvertently create identity copies.
	  // When that happens, just join the values and remove the copy.
	  if (CP.getSrcReg() == CP.getDstReg()) {
	    LiveInterval &LI = LIS->getInterval(CP.getSrcReg());
	    DEBUG(dbgs() << "\tCopy already coalesced: " << LI << '\n');
	    const SlotIndex CopyIdx = LIS->getInstructionIndex(*CopyMI);
	    LiveQueryResult LRQ = LI.Query(CopyIdx);
	    if (VNInfo *DefVNI = LRQ.valueDefined()) {
	      VNInfo *ReadVNI = LRQ.valueIn();
	      assert(ReadVNI && "No value before copy and no <undef> flag.");
	      assert(ReadVNI != DefVNI && "Cannot read and define the same value.");
	      LI.MergeValueNumberInto(DefVNI, ReadVNI);

	      // Process subregister liveranges.
	      for (LiveInterval::SubRange &S : LI.subranges()) {
	        LiveQueryResult SLRQ = S.Query(CopyIdx);
	        if (VNInfo *SDefVNI = SLRQ.valueDefined()) {
	          VNInfo *SReadVNI = SLRQ.valueIn();
	          S.MergeValueNumberInto(SDefVNI, SReadVNI);
	        }
	      }
	      DEBUG(dbgs() << "\tMerged values: " << LI << '\n');
	    }
	    LIS->RemoveMachineInstrFromMaps(*CopyMI);
	    CopyMI->eraseFromParent();
	    return true;
	  }

	  // Enforce policies.
	  if (CP.isPhys()) {
	    DEBUG(dbgs() << "\tConsidering merging " << PrintReg(CP.getSrcReg(), TRI)
	                 << " with " << PrintReg(CP.getDstReg(), TRI, CP.getSrcIdx())
	                 << '\n');
	    if (!canJoinPhys(CP)) {
	      // Before giving up coalescing, if definition of source is defined by
	      // trivial computation, try rematerializing it.
	      bool IsDefCopy;
	      if (reMaterializeTrivialDef(CP, CopyMI, IsDefCopy))
	        return true;
	      if (IsDefCopy)
	        Again = true; // May be possible to coalesce later.
	      return false;
	    }
	  } else {
	    // When possible, let DstReg be the larger interval.
	    if (!CP.isPartial() && LIS->getInterval(CP.getSrcReg()).size() >
	                               LIS->getInterval(CP.getDstReg()).size())
	      CP.flip();

	    DEBUG({
	      dbgs() << "\tConsidering merging to "
	             << TRI->getRegClassName(CP.getNewRC()) << " with ";
	      if (CP.getDstIdx() && CP.getSrcIdx())
	        dbgs() << PrintReg(CP.getDstReg()) << " in "
	               << TRI->getSubRegIndexName(CP.getDstIdx()) << " and "
	               << PrintReg(CP.getSrcReg()) << " in "
	               << TRI->getSubRegIndexName(CP.getSrcIdx()) << '\n';
	      else
	        dbgs() << PrintReg(CP.getSrcReg(), TRI) << " in "
	               << PrintReg(CP.getDstReg(), TRI, CP.getSrcIdx()) << '\n';
	    });
	  }

	  // Reset the shrink requests before attempting the join.
	  ShrinkMask = LaneBitmask::getNone();
	  ShrinkMainRange = false;

	  // Okay, attempt to join these two intervals. On failure, this returns false.
	  // Otherwise, if one of the intervals being joined is a physreg, this method
	  // always canonicalizes DstInt to be it. The output "SrcInt" will not have
	  // been modified, so we can use this information below to update aliases.
	  if (!joinIntervals(CP)) {
	    // Coalescing failed.

	    // If definition of source is defined by trivial computation, try
	    // rematerializing it.
	    bool IsDefCopy;
	    if (reMaterializeTrivialDef(CP, CopyMI, IsDefCopy))
	      return true;

	    // If we can eliminate the copy without merging the live segments, do so
	    // now.
	    if (!CP.isPartial() && !CP.isPhys()) {
	      if (adjustCopiesBackFrom(CP, CopyMI) ||
	          removeCopyByCommutingDef(CP, CopyMI)) {
	        LIS->RemoveMachineInstrFromMaps(*CopyMI);
	        CopyMI->eraseFromParent();
	        DEBUG(dbgs() << "\tTrivial!\n");
	        return true;
	      }
	    }

	    // Otherwise, we are unable to join the intervals.
	    DEBUG(dbgs() << "\tInterference!\n");
	    Again = true; // May be possible to coalesce later.
	    return false;
	  }

	  // Coalescing to a virtual register that is of a sub-register class of the
	  // other. Make sure the resulting register is set to the right register class.
	  if (CP.isCrossClass()) {
	    ++numCrossRCs;
	    MRI->setRegClass(CP.getDstReg(), CP.getNewRC());
	  }

	  // Removing sub-register copies can ease the register class constraints.
	  // Make sure we attempt to inflate the register class of DstReg.
	  if (!CP.isPhys() && RegClassInfo.isProperSubClass(CP.getNewRC()))
	    InflateRegs.push_back(CP.getDstReg());

	  // CopyMI has been erased by joinIntervals at this point. Remove it from
	  // ErasedInstrs since copyCoalesceWorkList() won't add a successful join back
	  // to the work list. This keeps ErasedInstrs from growing needlessly.
	  ErasedInstrs.erase(CopyMI);

	  // Rewrite all SrcReg operands to DstReg.
	  // Also update DstReg operands to include DstIdx if it is set.
	  if (CP.getDstIdx())
	    updateRegDefsUses(CP.getDstReg(), CP.getDstReg(), CP.getDstIdx());
	  updateRegDefsUses(CP.getSrcReg(), CP.getDstReg(), CP.getSrcIdx());

	  // Shrink subregister ranges if necessary.
	  if (ShrinkMask.any()) {
	    LiveInterval &LI = LIS->getInterval(CP.getDstReg());
	    for (LiveInterval::SubRange &S : LI.subranges()) {
	      if ((S.LaneMask & ShrinkMask).none())
	        continue;
	      DEBUG(dbgs() << "Shrink LaneUses (Lane " << PrintLaneMask(S.LaneMask)
	                   << ")\n");
	      LIS->shrinkToUses(S, LI.reg);
	    }
	    LI.removeEmptySubRanges();
	  }
	  if (ShrinkMainRange) {
	    LiveInterval &LI = LIS->getInterval(CP.getDstReg());
	    shrinkToUses(&LI);
	  }

	  // SrcReg is guaranteed to be the register whose live interval that is
	  // being merged.
	  LIS->removeInterval(CP.getSrcReg());

	  // Update regalloc hint.
	  TRI->updateRegAllocHint(CP.getSrcReg(), CP.getDstReg(), *MF);

	  DEBUG({
	    dbgs() << "\tSuccess: " << PrintReg(CP.getSrcReg(), TRI, CP.getSrcIdx())
	           << " -> " << PrintReg(CP.getDstReg(), TRI, CP.getDstIdx()) << '\n';
	    dbgs() << "\tResult = ";
	    if (CP.isPhys())
	      dbgs() << PrintReg(CP.getDstReg(), TRI);
	    else
	      dbgs() << LIS->getInterval(CP.getDstReg());
	    dbgs() << '\n';
	  });

	  ++numJoins;
	  return true;
	}

	/// Try to coalesce the copy described by @p CP into the reserved physical
	/// register CP.getDstReg().
	///
	/// Asserts that @p CP is a physreg pair, that the destination register is
	/// reserved, and that the source interval contains a single value. Returns
	/// false when a register unit root of the destination is not reserved, when
	/// the source interval overlaps a register unit of the destination, or (in
	/// the non-flipped case) when the vreg has more than one non-debug use, its
	/// interval spans more than one block, or an instruction between the def and
	/// the copy reads or regmask-clobbers the destination. Otherwise the copy is
	/// erased and true is returned.
	///
	/// NOTE(review): this function is shown as a unified diff hunk -- lines
	/// prefixed with '-' belong to the old revision and lines prefixed with '+'
	/// to the new one; they are reproduced unchanged here.
	bool RegisterCoalescer::joinReservedPhysReg(CoalescerPair &CP) {
	unsigned DstReg = CP.getDstReg();
	+ unsigned SrcReg = CP.getSrcReg();
	assert(CP.isPhys() && "Must be a physreg copy");
	assert(MRI->isReserved(DstReg) && "Not a reserved register");
	- LiveInterval &RHS = LIS->getInterval(CP.getSrcReg());
	+ LiveInterval &RHS = LIS->getInterval(SrcReg);
	DEBUG(dbgs() << "\t\tRHS = " << RHS << '\n');

	assert(RHS.containsOneValue() && "Invalid join with reserved register");

	// Optimization for reserved registers like ESP. We can only merge with a
	// reserved physreg if RHS has a single value that is a copy of DstReg.
	// The live range of the reserved register will look like a set of dead defs
	// - we don't properly track the live range of reserved registers.

	// Deny any overlapping intervals. This depends on all the reserved
	// register live ranges to look like dead defs.
	if (!MRI->isConstantPhysReg(DstReg)) {
	for (MCRegUnitIterator UI(DstReg, TRI); UI.isValid(); ++UI) {
	// Abort if not all the regunits are reserved.
	for (MCRegUnitRootIterator RI(*UI, TRI); RI.isValid(); ++RI) {
	if (!MRI->isReserved(*RI))
	return false;
	}
	if (RHS.overlaps(LIS->getRegUnit(*UI))) {
	DEBUG(dbgs() << "\t\tInterference: " << PrintRegUnit(*UI, TRI) << '\n');
	return false;
	}
	}
	}

	// Skip any value computations, we are not adding new values to the
	// reserved register. Also skip merging the live ranges, the reserved
	// register live range doesn't need to be accurate as long as all the
	// defs are there.

	// Delete the identity copy.
	MachineInstr *CopyMI;
	if (CP.isFlipped()) {
	- CopyMI = MRI->getVRegDef(RHS.reg);
	+ // Physreg is copied into vreg
	+ // %vregY = COPY %X
	+ // ... //< no other def of %X here
	+ // use %vregY
	+ // =>
	+ // ...
	+ // use %X
	+ CopyMI = MRI->getVRegDef(SrcReg);
	} else {
	- if (!MRI->hasOneNonDBGUse(RHS.reg)) {
	+ // VReg is copied into physreg:
	+ // %vregX = def
	+ // ... //< no other def or use of %Y here
	+ // %Y = COPY %vregX
	+ // =>
	+ // %Y = def
	+ // ...
	+ if (!MRI->hasOneNonDBGUse(SrcReg)) {
	DEBUG(dbgs() << "\t\tMultiple vreg uses!\n");
	return false;
	}

	- MachineInstr *DestMI = MRI->getVRegDef(RHS.reg);
	- CopyMI = &*MRI->use_instr_nodbg_begin(RHS.reg);
	- const SlotIndex CopyRegIdx = LIS->getInstructionIndex(*CopyMI).getRegSlot();
	- const SlotIndex DestRegIdx = LIS->getInstructionIndex(*DestMI).getRegSlot();
	+ if (!LIS->intervalIsInOneMBB(RHS)) {
	+ DEBUG(dbgs() << "\t\tComplex control flow!\n");
	+ return false;
	+ }

	+ MachineInstr &DestMI = *MRI->getVRegDef(SrcReg);
	+ CopyMI = &*MRI->use_instr_nodbg_begin(SrcReg);
	+ SlotIndex CopyRegIdx = LIS->getInstructionIndex(*CopyMI).getRegSlot();
	+ SlotIndex DestRegIdx = LIS->getInstructionIndex(DestMI).getRegSlot();
	+
	if (!MRI->isConstantPhysReg(DstReg)) {
	// We checked above that there are no interfering defs of the physical
	// register. However, for this case, where we intent to move up the def of
	// the physical register, we also need to check for interfering uses.
	SlotIndexes *Indexes = LIS->getSlotIndexes();
	for (SlotIndex SI = Indexes->getNextNonNullIndex(DestRegIdx);
	SI != CopyRegIdx; SI = Indexes->getNextNonNullIndex(SI)) {
	MachineInstr *MI = LIS->getInstructionFromIndex(SI);
	if (MI->readsRegister(DstReg, TRI)) {
	DEBUG(dbgs() << "\t\tInterference (read): " << *MI);
	return false;
	}

	// We must also check for clobbers caused by regmasks.
	for (const auto &MO : MI->operands()) {
	if (MO.isRegMask() && MO.clobbersPhysReg(DstReg)) {
	DEBUG(dbgs() << "\t\tInterference (regmask clobber): " << *MI);
	return false;
	}
	}
	}
	}

	// We're going to remove the copy which defines a physical reserved
	// register, so remove its valno, etc.
	- DEBUG(dbgs() << "\t\tRemoving phys reg def of " << DstReg << " at "
	- << CopyRegIdx << "\n");
	+ DEBUG(dbgs() << "\t\tRemoving phys reg def of " << PrintReg(DstReg, TRI)
	+ << " at " << CopyRegIdx << "\n");

	LIS->removePhysRegDefAt(DstReg, CopyRegIdx);
	// Create a new dead def at the new def location.
	for (MCRegUnitIterator UI(DstReg, TRI); UI.isValid(); ++UI) {
	LiveRange &LR = LIS->getRegUnit(*UI);
	LR.createDeadDef(DestRegIdx, LIS->getVNInfoAllocator());
	}
	}

	LIS->RemoveMachineInstrFromMaps(*CopyMI);
	CopyMI->eraseFromParent();

	// We don't track kills for reserved registers.
	MRI->clearKillFlags(CP.getSrcReg());

	return true;
	}

	//===----------------------------------------------------------------------===//
	// Interference checking and interval joining
	//===----------------------------------------------------------------------===//
	//
	// In the easiest case, the two live ranges being joined are disjoint, and
	// there is no interference to consider. It is quite common, though, to have
	// overlapping live ranges, and we need to check if the interference can be
	// resolved.
	//
	// The live range of a single SSA value forms a sub-tree of the dominator tree.
	// This means that two SSA values overlap if and only if the def of one value
	// is contained in the live range of the other value. As a special case, the
	// overlapping values can be defined at the same index.
	//
	// The interference from an overlapping def can be resolved in these cases:
	//
	// 1. Coalescable copies. The value is defined by a copy that would become an
	// identity copy after joining SrcReg and DstReg. The copy instruction will
	// be removed, and the value will be merged with the source value.
	//
	// There can be several copies back and forth, causing many values to be
	// merged into one. We compute a list of ultimate values in the joined live
	// range as well as a mapping from the old value numbers.
	//
	// 2. IMPLICIT_DEF. This instruction is only inserted to ensure all PHI
	// predecessors have a live out value. It doesn't cause real interference,
	// and can be merged into the value it overlaps. Like a coalescable copy, it
	// can be erased after joining.
	//
	// 3. Copy of external value. The overlapping def may be a copy of a value that
	// is already in the other register. This is like a coalescable copy, but
	// the live range of the source register must be trimmed after erasing the
	// copy instruction:
	//
	// %src = COPY %ext
	// %dst = COPY %ext <-- Remove this COPY, trim the live range of %ext.
	//
	// 4. Clobbering undefined lanes. Vector registers are sometimes built by
	// defining one lane at a time:
	//
	// %dst:ssub0<def,read-undef> = FOO
	// %src = BAR
	// %dst:ssub1<def> = COPY %src
	//
	// The live range of %src overlaps the %dst value defined by FOO, but
	// merging %src into %dst:ssub1 is only going to clobber the ssub1 lane
	// which was undef anyway.
	//
	// The value mapping is more complicated in this case. The final live range
	// will have different value numbers for both FOO and BAR, but there is no
	// simple mapping from old to new values. It may even be necessary to add
	// new PHI values.
	//
	// 5. Clobbering dead lanes. A def may clobber a lane of a vector register that
	// is live, but never read. This can happen because we don't compute
	// individual live ranges per lane.
	//
	// %dst<def> = FOO
	// %src = BAR
	// %dst:ssub1<def> = COPY %src
	//
	// This kind of interference is only resolved locally. If the clobbered
	// lane value escapes the block, the join is aborted.

	namespace {
	/// Track information about values in a single virtual register about to be
	/// joined. Objects of this class are always created in pairs - one for each
	/// side of the CoalescerPair (or one for each lane of a side of the coalescer
	/// pair).
	class JoinVals {
	  /// Live range we work on.
	  LiveRange &LR;
	  /// (Main) register we work on.
	  const unsigned Reg;

	  /// Reg (and therefore the values in this liverange) will end up as
	  /// subregister SubIdx in the coalesced register. Either CP.DstIdx or
	  /// CP.SrcIdx.
	  const unsigned SubIdx;
	  /// The LaneMask that this liverange will occupy in the coalesced register.
	  /// May be smaller than the lanemask produced by SubIdx when merging
	  /// subranges.
	  const LaneBitmask LaneMask;

	  /// This is true when joining sub register ranges, false when joining main
	  /// ranges.
	  const bool SubRangeJoin;
	  /// Whether the current LiveInterval tracks subregister liveness.
	  const bool TrackSubRegLiveness;

	  /// Values that will be present in the final live range.
	  SmallVectorImpl<VNInfo*> &NewVNInfo;

	  const CoalescerPair &CP;
	  LiveIntervals *LIS;
	  SlotIndexes *Indexes;
	  const TargetRegisterInfo *TRI;

	  /// Value number assignments. Maps value numbers in LI to entries in
	  /// NewVNInfo. This is suitable for passing to LiveInterval::join().
	  /// Entries start out as -1, meaning "not assigned yet".
	  SmallVector<int, 8> Assignments;

	  /// Conflict resolution for overlapping values.
	  enum ConflictResolution {
	    /// No overlap, simply keep this value.
	    CR_Keep,

	    /// Merge this value into OtherVNI and erase the defining instruction.
	    /// Used for IMPLICIT_DEF, coalescable copies, and copies from external
	    /// values.
	    CR_Erase,

	    /// Merge this value into OtherVNI but keep the defining instruction.
	    /// This is for the special case where OtherVNI is defined by the same
	    /// instruction.
	    CR_Merge,

	    /// Keep this value, and have it replace OtherVNI where possible. This
	    /// complicates value mapping since OtherVNI maps to two different values
	    /// before and after this def.
	    /// Used when clobbering undefined or dead lanes.
	    CR_Replace,

	    /// Unresolved conflict. Visit later when all values have been mapped.
	    CR_Unresolved,

	    /// Unresolvable conflict. Abort the join.
	    CR_Impossible
	  };

	  /// Per-value info for LI. The lane bit masks are all relative to the final
	  /// joined register, so they can be compared directly between SrcReg and
	  /// DstReg.
	  struct Val {
	    ConflictResolution Resolution;

	    /// Lanes written by this def, 0 for unanalyzed values.
	    LaneBitmask WriteLanes;

	    /// Lanes with defined values in this register. Other lanes are undef and
	    /// safe to clobber.
	    LaneBitmask ValidLanes;

	    /// Value in LI being redefined by this def.
	    VNInfo *RedefVNI;

	    /// Value in the other live range that overlaps this def, if any.
	    VNInfo *OtherVNI;

	    /// Is this value an IMPLICIT_DEF that can be erased?
	    ///
	    /// IMPLICIT_DEF values should only exist at the end of a basic block that
	    /// is a predecessor to a phi-value. These IMPLICIT_DEF instructions can be
	    /// safely erased if they are overlapping a live value in the other live
	    /// interval.
	    ///
	    /// Weird control flow graphs and incomplete PHI handling in
	    /// ProcessImplicitDefs can very rarely create IMPLICIT_DEF values with
	    /// longer live ranges. Such IMPLICIT_DEF values should be treated like
	    /// normal values.
	    bool ErasableImplicitDef;

	    /// True when the live range of this value will be pruned because of an
	    /// overlapping CR_Replace value in the other live range.
	    bool Pruned;

	    /// True once Pruned above has been computed.
	    bool PrunedComputed;

	    Val()
	        : Resolution(CR_Keep), WriteLanes(), ValidLanes(), RedefVNI(nullptr),
	          OtherVNI(nullptr), ErasableImplicitDef(false), Pruned(false),
	          PrunedComputed(false) {}

	    /// A value counts as analyzed once any write lane has been recorded.
	    bool isAnalyzed() const { return WriteLanes.any(); }
	  };

	  /// One entry per value number in LI.
	  SmallVector<Val, 8> Vals;

	  /// Compute the bitmask of lanes actually written by DefMI.
	  /// Set Redef if there are any partial register definitions that depend on the
	  /// previous value of the register.
	  LaneBitmask computeWriteLanes(const MachineInstr *DefMI, bool &Redef) const;

	  /// Find the ultimate value that VNI was copied from.
	  /// Returns that value together with the register it belongs to.
	  std::pair<const VNInfo*, unsigned> followCopyChain(const VNInfo *VNI) const;

	  /// Return true if Val0 (a value of this range) and Val1 (a value of
	  /// Other's range) can be shown to originate from the same definition.
	  bool valuesIdentical(VNInfo *Val0, VNInfo *Val1, const JoinVals &Other) const;

	  /// Analyze ValNo in this live range, and set all fields of Vals[ValNo].
	  /// Return a conflict resolution when possible, but leave the hard cases as
	  /// CR_Unresolved.
	  /// Recursively calls computeAssignment() on this and Other, guaranteeing that
	  /// both OtherVNI and RedefVNI have been analyzed and mapped before returning.
	  /// The recursion always goes upwards in the dominator tree, making loops
	  /// impossible.
	  ConflictResolution analyzeValue(unsigned ValNo, JoinVals &Other);

	  /// Compute the value assignment for ValNo in LR.
	  /// This may be called recursively by analyzeValue(), but never for a ValNo on
	  /// the stack.
	  void computeAssignment(unsigned ValNo, JoinVals &Other);

	  /// Assuming ValNo is going to clobber some valid lanes in Other.LR, compute
	  /// the extent of the tainted lanes in the block.
	  ///
	  /// Multiple values in Other.LR can be affected since partial redefinitions
	  /// can preserve previously tainted lanes.
	  ///
	  /// 1 %dst = VLOAD <-- Define all lanes in %dst
	  /// 2 %src = FOO <-- ValNo to be joined with %dst:ssub0
	  /// 3 %dst:ssub1 = BAR <-- Partial redef doesn't clear taint in ssub0
	  /// 4 %dst:ssub0 = COPY %src <-- Conflict resolved, ssub0 wasn't read
	  ///
	  /// For each ValNo in Other that is affected, add an (EndIndex, TaintedLanes)
	  /// entry to TaintedVals.
	  ///
	  /// Returns false if the tainted lanes extend beyond the basic block.
	  bool taintExtent(unsigned, LaneBitmask, JoinVals&,
	                   SmallVectorImpl<std::pair<SlotIndex, LaneBitmask> >&);

	  /// Return true if MI uses any of the given Lanes from Reg.
	  /// This does not include partial redefinitions of Reg.
	  bool usesLanes(const MachineInstr &MI, unsigned, unsigned, LaneBitmask) const;

	  /// Determine if ValNo is a copy of a value number in LR or Other.LR that will
	  /// be pruned:
	  ///
	  /// %dst = COPY %src
	  /// %src = COPY %dst <-- This value to be pruned.
	  /// %dst = COPY %src <-- This value is a copy of a pruned value.
	  bool isPrunedValue(unsigned ValNo, JoinVals &Other);

	public:
	  JoinVals(LiveRange &LR, unsigned Reg, unsigned SubIdx, LaneBitmask LaneMask,
	           SmallVectorImpl<VNInfo*> &newVNInfo, const CoalescerPair &cp,
	           LiveIntervals *lis, const TargetRegisterInfo *TRI, bool SubRangeJoin,
	           bool TrackSubRegLiveness)
	    : LR(LR), Reg(Reg), SubIdx(SubIdx), LaneMask(LaneMask),
	      SubRangeJoin(SubRangeJoin), TrackSubRegLiveness(TrackSubRegLiveness),
	      NewVNInfo(newVNInfo), CP(cp), LIS(lis), Indexes(LIS->getSlotIndexes()),
	      TRI(TRI), Assignments(LR.getNumValNums(), -1), Vals(LR.getNumValNums())
	  {}

	  /// Analyze defs in LR and compute a value mapping in NewVNInfo.
	  /// Returns false if any conflicts were impossible to resolve.
	  bool mapValues(JoinVals &Other);

	  /// Try to resolve conflicts that require all values to be mapped.
	  /// Returns false if any conflicts were impossible to resolve.
	  bool resolveConflicts(JoinVals &Other);

	  /// Prune the live range of values in Other.LR where they would conflict with
	  /// CR_Replace values in LR. Collect end points for restoring the live range
	  /// after joining.
	  void pruneValues(JoinVals &Other, SmallVectorImpl<SlotIndex> &EndPoints,
	                   bool changeInstrs);

	  /// Removes subranges starting at copies that get removed. This sometimes
	  /// happens when undefined subranges are copied around. These ranges contain
	  /// no useful information and can be removed.
	  void pruneSubRegValues(LiveInterval &LI, LaneBitmask &ShrinkMask);

	  /// Pruning values in subranges can lead to removing segments in these
	  /// subranges started by IMPLICIT_DEFs. The corresponding segments in
	  /// the main range also need to be removed. This function will mark
	  /// the corresponding values in the main range as pruned, so that
	  /// eraseInstrs can do the final cleanup.
	  /// The parameter @p LI must be the interval whose main range is the
	  /// live range LR.
	  void pruneMainSegments(LiveInterval &LI, bool &ShrinkMainRange);

	  /// Erase any machine instructions that have been coalesced away.
	  /// Add erased instructions to ErasedInstrs.
	  /// Add foreign virtual registers to ShrinkRegs if their live range ended at
	  /// the erased instrs.
	  void eraseInstrs(SmallPtrSetImpl<MachineInstr*> &ErasedInstrs,
	                   SmallVectorImpl<unsigned> &ShrinkRegs,
	                   LiveInterval *LI = nullptr);

	  /// Remove liverange defs at places where implicit defs will be removed.
	  void removeImplicitDefs();

	  /// Get the value assignments suitable for passing to LiveInterval::join.
	  const int *getAssignments() const { return Assignments.data(); }
	};
	} // end anonymous namespace

	/// Accumulate the lane masks of every def operand of Reg on DefMI, expressed
	/// relative to the joined register (MO's subreg index composed with SubIdx).
	/// Redef is set to true, never cleared, when such a def operand also reads
	/// the register.
	LaneBitmask JoinVals::computeWriteLanes(const MachineInstr *DefMI, bool &Redef)
	  const {
	  LaneBitmask L;
	  for (const MachineOperand &MO : DefMI->operands()) {
	    if (!MO.isReg() || MO.getReg() != Reg || !MO.isDef())
	      continue;
	    L |= TRI->getSubRegIndexLaneMask(
	        TRI->composeSubRegIndices(SubIdx, MO.getSubReg()));
	    if (MO.readsReg())
	      Redef = true;
	  }
	  return L;
	}

	/// Walk backwards through full copies of virtual registers starting at VNI.
	/// The walk stops at a PHI def, at a def that is not a full copy, at a copy
	/// from a non-virtual register, or when no incoming value is found for the
	/// copy source. Returns the last value reached and the register it lives in.
	std::pair<const VNInfo*, unsigned> JoinVals::followCopyChain(
	    const VNInfo *VNI) const {
	  unsigned Reg = this->Reg;

	  while (!VNI->isPHIDef()) {
	    SlotIndex Def = VNI->def;
	    MachineInstr *MI = Indexes->getInstructionFromIndex(Def);
	    assert(MI && "No defining instruction");
	    if (!MI->isFullCopy())
	      return std::make_pair(VNI, Reg);
	    unsigned SrcReg = MI->getOperand(1).getReg();
	    if (!TargetRegisterInfo::isVirtualRegister(SrcReg))
	      return std::make_pair(VNI, Reg);

	    const LiveInterval &LI = LIS->getInterval(SrcReg);
	    const VNInfo *ValueIn;
	    // No subrange involved.
	    if (!SubRangeJoin || !LI.hasSubRanges()) {
	      LiveQueryResult LRQ = LI.Query(Def);
	      ValueIn = LRQ.valueIn();
	    } else {
	      // Query subranges. Pick the first matching one.
	      ValueIn = nullptr;
	      for (const LiveInterval::SubRange &S : LI.subranges()) {
	        // Transform lanemask to a mask in the joined live interval.
	        LaneBitmask SMask = TRI->composeSubRegIndexLaneMask(SubIdx, S.LaneMask);
	        if ((SMask & LaneMask).none())
	          continue;
	        LiveQueryResult LRQ = S.Query(Def);
	        ValueIn = LRQ.valueIn();
	        break;
	      }
	    }
	    if (ValueIn == nullptr)
	      break;
	    VNI = ValueIn;
	    Reg = SrcReg;
	  }
	  return std::make_pair(VNI, Reg);
	}

	/// Return true if Value0 (from this range) and Value1 (from Other's range)
	/// trace back, through their respective copy chains, to the same definition
	/// in the same register.
	bool JoinVals::valuesIdentical(VNInfo *Value0, VNInfo *Value1,
	                               const JoinVals &Other) const {
	  const VNInfo *Orig0;
	  unsigned Reg0;
	  std::tie(Orig0, Reg0) = followCopyChain(Value0);
	  if (Orig0 == Value1)
	    return true;

	  const VNInfo *Orig1;
	  unsigned Reg1;
	  std::tie(Orig1, Reg1) = Other.followCopyChain(Value1);

	  // The values are equal if they are defined at the same place and use the
	  // same register. Note that we cannot compare VNInfos directly as some of
	  // them might be from a copy created in mergeSubRangeInto() while the other
	  // is from the original LiveInterval.
	  return Orig0->def == Orig1->def && Reg0 == Reg1;
	}

	/// Analyze value number ValNo of LR against Other.LR and fill in Vals[ValNo].
	///
	/// Computes the written/valid lanes of the def, finds the value of Other.LR
	/// that is defined at or live into the def, and classifies the overlap.
	/// Hard cases are returned as CR_Unresolved and revisited by
	/// resolveConflicts(). May recurse into computeAssignment() on either side.
	JoinVals::ConflictResolution
	JoinVals::analyzeValue(unsigned ValNo, JoinVals &Other) {
	  Val &V = Vals[ValNo];
	  assert(!V.isAnalyzed() && "Value has already been analyzed!");
	  VNInfo *VNI = LR.getValNumInfo(ValNo);
	  if (VNI->isUnused()) {
	    V.WriteLanes = LaneBitmask::getAll();
	    return CR_Keep;
	  }

	  // Get the instruction defining this value, compute the lanes written.
	  const MachineInstr *DefMI = nullptr;
	  if (VNI->isPHIDef()) {
	    // Conservatively assume that all lanes in a PHI are valid.
	    LaneBitmask Lanes = SubRangeJoin ? LaneBitmask(1)
	                                     : TRI->getSubRegIndexLaneMask(SubIdx);
	    V.ValidLanes = V.WriteLanes = Lanes;
	  } else {
	    DefMI = Indexes->getInstructionFromIndex(VNI->def);
	    assert(DefMI != nullptr);
	    if (SubRangeJoin) {
	      // We don't care about the lanes when joining subregister ranges.
	      V.WriteLanes = V.ValidLanes = LaneBitmask(1);
	      if (DefMI->isImplicitDef()) {
	        V.ValidLanes = LaneBitmask::getNone();
	        V.ErasableImplicitDef = true;
	      }
	    } else {
	      bool Redef = false;
	      V.ValidLanes = V.WriteLanes = computeWriteLanes(DefMI, Redef);

	      // If this is a read-modify-write instruction, there may be more valid
	      // lanes than the ones written by this instruction.
	      // This only covers partial redef operands. DefMI may have normal use
	      // operands reading the register. They don't contribute valid lanes.
	      //
	      // This adds ssub1 to the set of valid lanes in %src:
	      //
	      // %src:ssub1<def> = FOO
	      //
	      // This leaves only ssub1 valid, making any other lanes undef:
	      //
	      // %src:ssub1<def,read-undef> = FOO %src:ssub2
	      //
	      // The <read-undef> flag on the def operand means that old lane values are
	      // not important.
	      if (Redef) {
	        V.RedefVNI = LR.Query(VNI->def).valueIn();
	        assert((TrackSubRegLiveness || V.RedefVNI) &&
	               "Instruction is reading nonexistent value");
	        if (V.RedefVNI != nullptr) {
	          computeAssignment(V.RedefVNI->id, Other);
	          V.ValidLanes |= Vals[V.RedefVNI->id].ValidLanes;
	        }
	      }

	      // An IMPLICIT_DEF writes undef values.
	      if (DefMI->isImplicitDef()) {
	        // We normally expect IMPLICIT_DEF values to be live only until the end
	        // of their block. If the value is really live longer and gets pruned in
	        // another block, this flag is cleared again.
	        V.ErasableImplicitDef = true;
	        V.ValidLanes &= ~V.WriteLanes;
	      }
	    }
	  }

	  // Find the value in Other that overlaps VNI->def, if any.
	  LiveQueryResult OtherLRQ = Other.LR.Query(VNI->def);

	  // It is possible that both values are defined by the same instruction, or
	  // the values are PHIs defined in the same block. When that happens, the two
	  // values should be merged into one, but not into any preceding value.
	  // The first value defined or visited gets CR_Keep, the other gets CR_Merge.
	  if (VNInfo *OtherVNI = OtherLRQ.valueDefined()) {
	    assert(SlotIndex::isSameInstr(VNI->def, OtherVNI->def) && "Broken LRQ");

	    // One value stays, the other is merged. Keep the earlier one, or the first
	    // one we see.
	    if (OtherVNI->def < VNI->def)
	      Other.computeAssignment(OtherVNI->id, *this);
	    else if (VNI->def < OtherVNI->def && OtherLRQ.valueIn()) {
	      // This is an early-clobber def overlapping a live-in value in the other
	      // register. Not mergeable.
	      V.OtherVNI = OtherLRQ.valueIn();
	      return CR_Impossible;
	    }
	    V.OtherVNI = OtherVNI;
	    Val &OtherV = Other.Vals[OtherVNI->id];
	    // Keep this value, check for conflicts when analyzing OtherVNI.
	    if (!OtherV.isAnalyzed())
	      return CR_Keep;
	    // Both sides have been analyzed now.
	    // Allow overlapping PHI values. Any real interference would show up in a
	    // predecessor, the PHI itself can't introduce any conflicts.
	    if (VNI->isPHIDef())
	      return CR_Merge;
	    if ((V.ValidLanes & OtherV.ValidLanes).any())
	      // Overlapping lanes can't be resolved.
	      return CR_Impossible;
	    else
	      return CR_Merge;
	  }

	  // No simultaneous def. Is Other live at the def?
	  V.OtherVNI = OtherLRQ.valueIn();
	  if (!V.OtherVNI)
	    // No overlap, no conflict.
	    return CR_Keep;

	  assert(!SlotIndex::isSameInstr(VNI->def, V.OtherVNI->def) && "Broken LRQ");

	  // We have overlapping values, or possibly a kill of Other.
	  // Recursively compute assignments up the dominator tree.
	  Other.computeAssignment(V.OtherVNI->id, *this);
	  Val &OtherV = Other.Vals[V.OtherVNI->id];

	  // Check if OtherV is an IMPLICIT_DEF that extends beyond its basic block.
	  // This shouldn't normally happen, but ProcessImplicitDefs can leave such
	  // IMPLICIT_DEF instructions behind, and there is nothing wrong with it
	  // technically.
	  //
	  // When it happens, treat that IMPLICIT_DEF as a normal value, and don't try
	  // to erase the IMPLICIT_DEF instruction.
	  if (OtherV.ErasableImplicitDef && DefMI &&
	      DefMI->getParent() != Indexes->getMBBFromIndex(V.OtherVNI->def)) {
	    DEBUG(dbgs() << "IMPLICIT_DEF defined at " << V.OtherVNI->def
	                 << " extends into BB#" << DefMI->getParent()->getNumber()
	                 << ", keeping it.\n");
	    OtherV.ErasableImplicitDef = false;
	  }

	  // Allow overlapping PHI values. Any real interference would show up in a
	  // predecessor, the PHI itself can't introduce any conflicts.
	  // (DefMI is null for PHI defs, so this must precede the DefMI uses below.)
	  if (VNI->isPHIDef())
	    return CR_Replace;

	  // Check for simple erasable conflicts.
	  if (DefMI->isImplicitDef()) {
	    // We need the def for the subregister if there is nothing else live at the
	    // subrange at this point.
	    if (TrackSubRegLiveness
	        && (V.WriteLanes & (OtherV.ValidLanes | OtherV.WriteLanes)).none())
	      return CR_Replace;
	    return CR_Erase;
	  }

	  // Include the non-conflict where DefMI is a coalescable copy that kills
	  // OtherVNI. We still want the copy erased and value numbers merged.
	  if (CP.isCoalescable(DefMI)) {
	    // Some of the lanes copied from OtherVNI may be undef, making them undef
	    // here too.
	    V.ValidLanes &= ~V.WriteLanes | OtherV.ValidLanes;
	    return CR_Erase;
	  }

	  // This may not be a real conflict if DefMI simply kills Other and defines
	  // VNI.
	  if (OtherLRQ.isKill() && OtherLRQ.endPoint() <= VNI->def)
	    return CR_Keep;

	  // Handle the case where VNI and OtherVNI can be proven to be identical:
	  //
	  // %other = COPY %ext
	  // %this = COPY %ext <-- Erase this copy
	  //
	  if (DefMI->isFullCopy() && !CP.isPartial()
	      && valuesIdentical(VNI, V.OtherVNI, Other))
	    return CR_Erase;

	  // If the lanes written by this instruction were all undef in OtherVNI, it is
	  // still safe to join the live ranges. This can't be done with a simple value
	  // mapping, though - OtherVNI will map to multiple values:
	  //
	  // 1 %dst:ssub0 = FOO <-- OtherVNI
	  // 2 %src = BAR <-- VNI
	  // 3 %dst:ssub1 = COPY %src<kill> <-- Eliminate this copy.
	  // 4 BAZ %dst<kill>
	  // 5 QUUX %src<kill>
	  //
	  // Here OtherVNI will map to itself in [1;2), but to VNI in [2;5). CR_Replace
	  // handles this complex value mapping.
	  if ((V.WriteLanes & OtherV.ValidLanes).none())
	    return CR_Replace;

	  // If the other live range is killed by DefMI and the live ranges are still
	  // overlapping, it must be because we're looking at an early clobber def:
	  //
	  // %dst<def,early-clobber> = ASM %src<kill>
	  //
	  // In this case, it is illegal to merge the two live ranges since the early
	  // clobber def would clobber %src before it was read.
	  if (OtherLRQ.isKill()) {
	    // This case where the def doesn't overlap the kill is handled above.
	    assert(VNI->def.isEarlyClobber() &&
	           "Only early clobber defs can overlap a kill");
	    return CR_Impossible;
	  }

	  // VNI is clobbering live lanes in OtherVNI, but there is still the
	  // possibility that no instructions actually read the clobbered lanes.
	  // If we're clobbering all the lanes in OtherVNI, at least one must be read.
	  // Otherwise Other.LR wouldn't be live here.
	  if ((TRI->getSubRegIndexLaneMask(Other.SubIdx) & ~V.WriteLanes).none())
	    return CR_Impossible;

	  // We need to verify that no instructions are reading the clobbered lanes. To
	  // save compile time, we'll only check that locally. Don't allow the tainted
	  // value to escape the basic block.
	  MachineBasicBlock *MBB = Indexes->getMBBFromIndex(VNI->def);
	  if (OtherLRQ.endPoint() >= Indexes->getMBBEndIdx(MBB))
	    return CR_Impossible;

	  // There are still some things that could go wrong besides clobbered lanes
	  // being read, for example OtherVNI may be only partially redefined in MBB,
	  // and some clobbered lanes could escape the block. Save this analysis for
	  // resolveConflicts() when all values have been mapped. We need to know
	  // RedefVNI and WriteLanes for any later defs in MBB, and we can't compute
	  // that now - the recursive analyzeValue() calls must go upwards in the
	  // dominator tree.
	  return CR_Unresolved;
	}

	void JoinVals::computeAssignment(unsigned ValNo, JoinVals &Other) {
	Val &V = Vals[ValNo];
	if (V.isAnalyzed()) {
	// Recursion should always move up the dominator tree, so ValNo is not
	// supposed to reappear before it has been assigned.
	assert(Assignments[ValNo] != -1 && "Bad recursion?");
	return;
	}
	switch ((V.Resolution = analyzeValue(ValNo, Other))) {
	case CR_Erase:
	case CR_Merge:
	// Merge this ValNo into OtherVNI.
	assert(V.OtherVNI && "OtherVNI not assigned, can't merge.");
	assert(Other.Vals[V.OtherVNI->id].isAnalyzed() && "Missing recursion");
	Assignments[ValNo] = Other.Assignments[V.OtherVNI->id];
	DEBUG(dbgs() << "\t\tmerge " << PrintReg(Reg) << ':' << ValNo << '@'
	<< LR.getValNumInfo(ValNo)->def << " into "
	<< PrintReg(Other.Reg) << ':' << V.OtherVNI->id << '@'
	<< V.OtherVNI->def << " --> @"
	<< NewVNInfo[Assignments[ValNo]]->def << '\n');
	break;
	case CR_Replace:
	case CR_Unresolved: {
	// The other value is going to be pruned if this join is successful.
	assert(V.OtherVNI && "OtherVNI not assigned, can't prune");
	Val &OtherV = Other.Vals[V.OtherVNI->id];
	// We cannot erase an IMPLICIT_DEF if we don't have valid values for all
	// its lanes.
	if ((OtherV.WriteLanes & ~V.ValidLanes).any() && TrackSubRegLiveness)
	OtherV.ErasableImplicitDef = false;
	OtherV.Pruned = true;
	LLVM_FALLTHROUGH;
	}
	default:
	// This value number needs to go in the final joined live range.
	Assignments[ValNo] = NewVNInfo.size();
	NewVNInfo.push_back(LR.getValNumInfo(ValNo));
	break;
	}
	}

	/// Compute an assignment for every value number of LR, in index order.
	/// Stops and returns false at the first value whose resolution is
	/// CR_Impossible; returns true when all values were mapped.
	bool JoinVals::mapValues(JoinVals &Other) {
	  const unsigned NumVals = LR.getNumValNums();
	  for (unsigned ValNo = 0; ValNo != NumVals; ++ValNo) {
	    computeAssignment(ValNo, Other);
	    if (Vals[ValNo].Resolution != CR_Impossible)
	      continue;
	    DEBUG(dbgs() << "\t\tinterference at " << PrintReg(Reg) << ':' << ValNo
	                 << '@' << LR.getValNumInfo(ValNo)->def << '\n');
	    return false;
	  }
	  return true;
	}

	/// Collect the segments of Other.LR, from the def of ValNo to the end of its
	/// basic block, whose lanes would be tainted by ValNo.
	///
	/// Each affected segment appends (segment end, lanes still tainted) to
	/// TaintExtent. Returns false as soon as a tainted segment reaches the end
	/// of the block, true otherwise.
	bool JoinVals::
	taintExtent(unsigned ValNo, LaneBitmask TaintedLanes, JoinVals &Other,
	            SmallVectorImpl<std::pair<SlotIndex, LaneBitmask> > &TaintExtent) {
	  VNInfo *VNI = LR.getValNumInfo(ValNo);
	  MachineBasicBlock *MBB = Indexes->getMBBFromIndex(VNI->def);
	  SlotIndex MBBEnd = Indexes->getMBBEndIdx(MBB);

	  // Scan Other.LR from VNI.def to MBBEnd.
	  LiveInterval::iterator OtherI = Other.LR.find(VNI->def);
	  assert(OtherI != Other.LR.end() && "No conflict?");
	  do {
	    // OtherI is pointing to a tainted value. Abort the join if the tainted
	    // lanes escape the block.
	    SlotIndex End = OtherI->end;
	    if (End >= MBBEnd) {
	      DEBUG(dbgs() << "\t\ttaints global " << PrintReg(Other.Reg) << ':'
	                   << OtherI->valno->id << '@' << OtherI->start << '\n');
	      return false;
	    }
	    DEBUG(dbgs() << "\t\ttaints local " << PrintReg(Other.Reg) << ':'
	                 << OtherI->valno->id << '@' << OtherI->start
	                 << " to " << End << '\n');
	    // A dead def is not a problem.
	    if (End.isDead())
	      break;
	    TaintExtent.push_back(std::make_pair(End, TaintedLanes));

	    // Check for another def in the MBB.
	    if (++OtherI == Other.LR.end() || OtherI->start >= MBBEnd)
	      break;

	    // Lanes written by the new def are no longer tainted.
	    const Val &OV = Other.Vals[OtherI->valno->id];
	    TaintedLanes &= ~OV.WriteLanes;
	    // A def that does not redefine a previous value ends the taint chain.
	    if (!OV.RedefVNI)
	      break;
	  } while (TaintedLanes.any());
	  return true;
	}

	/// Return true if MI has a use operand of Reg that reads the register and
	/// whose lanes (MO's subreg index composed with SubIdx) intersect Lanes.
	/// Debug values never count as uses; def operands are skipped.
	bool JoinVals::usesLanes(const MachineInstr &MI, unsigned Reg, unsigned SubIdx,
	                         LaneBitmask Lanes) const {
	  if (MI.isDebugValue())
	    return false;
	  for (const MachineOperand &MO : MI.operands()) {
	    if (!MO.isReg() || MO.isDef() || MO.getReg() != Reg)
	      continue;
	    if (!MO.readsReg())
	      continue;
	    unsigned S = TRI->composeSubRegIndices(SubIdx, MO.getSubReg());
	    if ((Lanes & TRI->getSubRegIndexLaneMask(S)).any())
	      return true;
	  }
	  return false;
	}

	/// Revisit every value left as CR_Unresolved and try to turn it into
	/// CR_Replace.
	///
	/// A conflict is resolved when no instruction between the def and the end
	/// of the tainted extent reads a tainted lane of Other.Reg. Returns false
	/// when a conflict cannot be resolved: a subrange join, taint escaping the
	/// block, or a tainted lane being read.
	bool JoinVals::resolveConflicts(JoinVals &Other) {
	  const unsigned NumVals = LR.getNumValNums();
	  for (unsigned ValNo = 0; ValNo != NumVals; ++ValNo) {
	    Val &Cur = Vals[ValNo];
	    assert (Cur.Resolution != CR_Impossible && "Unresolvable conflict");
	    if (Cur.Resolution != CR_Unresolved)
	      continue;
	    DEBUG(dbgs() << "\t\tconflict at " << PrintReg(Reg) << ':' << ValNo
	                 << '@' << LR.getValNumInfo(ValNo)->def << '\n');
	    if (SubRangeJoin)
	      return false;

	    ++NumLaneConflicts;
	    assert(Cur.OtherVNI && "Inconsistent conflict resolution.");
	    VNInfo *DefVNI = LR.getValNumInfo(ValNo);
	    const Val &OtherInfo = Other.Vals[Cur.OtherVNI->id];

	    // DefVNI is known to clobber some lanes in OtherVNI. If we go ahead with
	    // the join, those lanes will be tainted with a wrong value. Get the extent
	    // of the tainted lanes.
	    LaneBitmask TaintedLanes = Cur.WriteLanes & OtherInfo.ValidLanes;
	    SmallVector<std::pair<SlotIndex, LaneBitmask>, 8> Extents;
	    if (!taintExtent(ValNo, TaintedLanes, Other, Extents))
	      // Tainted lanes would extend beyond the basic block.
	      return false;

	    assert(!Extents.empty() && "There should be at least one conflict.");

	    // Now look at the instructions from the def to the last extent (inclusive).
	    MachineBasicBlock *MBB = Indexes->getMBBFromIndex(DefVNI->def);
	    MachineBasicBlock::iterator It = MBB->begin();
	    if (!DefVNI->isPHIDef()) {
	      It = Indexes->getInstructionFromIndex(DefVNI->def);
	      // No need to check the instruction defining the value for reads.
	      ++It;
	    }
	    assert(!SlotIndex::isSameInstr(DefVNI->def, Extents.front().first) &&
	           "Interference ends on VNI->def. Should have been handled earlier");
	    MachineInstr *RangeEndMI =
	        Indexes->getInstructionFromIndex(Extents.front().first);
	    assert(RangeEndMI && "Range must end at a proper instruction");

	    unsigned ExtentIdx = 0;
	    while (true) {
	      assert(It != MBB->end() && "Bad LastMI");
	      if (usesLanes(*It, Other.Reg, Other.SubIdx, TaintedLanes)) {
	        DEBUG(dbgs() << "\t\ttainted lanes used by: " << *It);
	        return false;
	      }
	      // RangeEndMI is the last instruction to use the current value.
	      if (&*It == RangeEndMI) {
	        ++ExtentIdx;
	        if (ExtentIdx == Extents.size())
	          break;
	        // Move on to the next tainted segment and its remaining lanes.
	        RangeEndMI = Indexes->getInstructionFromIndex(Extents[ExtentIdx].first);
	        assert(RangeEndMI && "Range must end at a proper instruction");
	        TaintedLanes = Extents[ExtentIdx].second;
	      }
	      ++It;
	    }

	    // The tainted lanes are unused.
	    Cur.Resolution = CR_Replace;
	    ++NumLaneResolves;
	  }
	  return true;
	}

	/// Return whether ValNo is pruned, either directly or because the value it
	/// was erased/merged into (followed recursively across both sides) is
	/// pruned. The answer is cached in Pruned/PrunedComputed.
	bool JoinVals::isPrunedValue(unsigned ValNo, JoinVals &Other) {
	  Val &V = Vals[ValNo];
	  if (V.Pruned || V.PrunedComputed)
	    return V.Pruned;

	  // Only erased or merged values inherit pruning from their OtherVNI.
	  if (V.Resolution != CR_Erase && V.Resolution != CR_Merge)
	    return V.Pruned;

	  // Follow copies up the dominator tree and check if any intermediate value
	  // has been pruned.
	  V.PrunedComputed = true;
	  V.Pruned = Other.isPrunedValue(V.OtherVNI->id, *this);
	  return V.Pruned;
	}

	/// Prune live ranges according to the resolution chosen for each value of LR.
	///
	/// CR_Replace values prune Other.LR at their def (optionally adjusting the
	/// def operand flags when changeInstrs is set); CR_Erase/CR_Merge values
	/// that are copies of a pruned value prune LR itself. Pruned end points are
	/// collected in EndPoints.
	void JoinVals::pruneValues(JoinVals &Other,
	                           SmallVectorImpl<SlotIndex> &EndPoints,
	                           bool changeInstrs) {
	  const unsigned NumVals = LR.getNumValNums();
	  for (unsigned ValNo = 0; ValNo != NumVals; ++ValNo) {
	    SlotIndex DefIdx = LR.getValNumInfo(ValNo)->def;
	    switch (Vals[ValNo].Resolution) {
	    case CR_Keep:
	      break;
	    case CR_Replace: {
	      // This value takes precedence over the value in Other.LR.
	      LIS->pruneValue(Other.LR, DefIdx, &EndPoints);
	      // Check if we're replacing an IMPLICIT_DEF value. The IMPLICIT_DEF
	      // instructions are only inserted to provide a live-out value for PHI
	      // predecessors, so the instruction should simply go away once its value
	      // has been replaced.
	      Val &OtherInfo = Other.Vals[Vals[ValNo].OtherVNI->id];
	      bool EraseImpDef =
	          OtherInfo.ErasableImplicitDef && OtherInfo.Resolution == CR_Keep;
	      if (!DefIdx.isBlock()) {
	        if (changeInstrs) {
	          // Remove <def,read-undef> flags. This def is now a partial redef.
	          // Also remove <def,dead> flags since the joined live range will
	          // continue past this instruction.
	          MachineInstr *DefMI = Indexes->getInstructionFromIndex(DefIdx);
	          for (MachineOperand &MO : DefMI->operands()) {
	            if (!MO.isReg() || !MO.isDef() || MO.getReg() != Reg)
	              continue;
	            if (MO.getSubReg() != 0)
	              MO.setIsUndef(EraseImpDef);
	            MO.setIsDead(false);
	          }
	        }
	        // This value will reach instructions below, but we need to make sure
	        // the live range also reaches the instruction at the def.
	        if (!EraseImpDef)
	          EndPoints.push_back(DefIdx);
	      }
	      DEBUG(dbgs() << "\t\tpruned " << PrintReg(Other.Reg) << " at " << DefIdx
	                   << ": " << Other.LR << '\n');
	      break;
	    }
	    case CR_Erase:
	    case CR_Merge:
	      if (!isPrunedValue(ValNo, Other))
	        break;
	      // This value is ultimately a copy of a pruned value in LR or Other.LR.
	      // We can no longer trust the value mapping computed by
	      // computeAssignment(), the value that was originally copied could have
	      // been replaced.
	      LIS->pruneValue(LR, DefIdx, &EndPoints);
	      DEBUG(dbgs() << "\t\tpruned all of " << PrintReg(Reg) << " at "
	                   << DefIdx << ": " << LR << '\n');
	      break;
	    case CR_Unresolved:
	    case CR_Impossible:
	      llvm_unreachable("Unresolved conflicts");
	    }
	  }
	}

	/// For every value of LR resolved as CR_Erase, inspect the subranges of LI at
	/// the def being removed.
	///
	/// A subrange value that starts there is pruned and marked unused; a
	/// subrange that ends there has its lane mask added to ShrinkMask. Empty
	/// subranges are removed from LI if anything was pruned.
	void JoinVals::pruneSubRegValues(LiveInterval &LI, LaneBitmask &ShrinkMask) {
	  // Look for values being erased.
	  bool DidPrune = false;
	  for (unsigned i = 0, e = LR.getNumValNums(); i != e; ++i) {
	    if (Vals[i].Resolution != CR_Erase)
	      continue;

	    // Check subranges at the point where the copy will be removed.
	    SlotIndex Def = LR.getValNumInfo(i)->def;
	    for (LiveInterval::SubRange &S : LI.subranges()) {
	      LiveQueryResult Q = S.Query(Def);

	      // If a subrange starts at the copy then an undefined value has been
	      // copied and we must remove that subrange value as well.
	      VNInfo *ValueOut = Q.valueOutOrDead();
	      if (ValueOut != nullptr && Q.valueIn() == nullptr) {
	        DEBUG(dbgs() << "\t\tPrune sublane " << PrintLaneMask(S.LaneMask)
	                     << " at " << Def << "\n");
	        LIS->pruneValue(S, Def, nullptr);
	        DidPrune = true;
	        // Mark value number as unused.
	        ValueOut->markUnused();
	        continue;
	      }
	      // If a subrange ends at the copy, then a value was copied but only
	      // partially used later. Shrink the subregister range appropriately.
	      if (Q.valueIn() != nullptr && Q.valueOut() == nullptr) {
	        DEBUG(dbgs() << "\t\tDead uses at sublane " << PrintLaneMask(S.LaneMask)
	                     << " at " << Def << "\n");
	        ShrinkMask |= S.LaneMask;
	      }
	    }
	  }
	  if (DidPrune)
	    LI.removeEmptySubRanges();
	}

	/// Return true if at least one subrange of @p LI has a value whose
	/// definition point is exactly @p Def.
	static bool isDefInSubRange(LiveInterval &LI, SlotIndex Def) {
	  for (LiveInterval::SubRange &Sub : LI.subranges()) {
	    // Value leaving (or dying at) Def in this subrange, if any.
	    VNInfo *Out = Sub.Query(Def).valueOutOrDead();
	    if (Out == nullptr)
	      continue;
	    if (Out->def == Def)
	      return true;
	  }
	  return false;
	}

	/// Mark main-range values of \p LI as pruned when no subrange defines them.
	///
	/// For every value resolved as CR_Keep that is in use, is not a PHI def and
	/// has no subrange definition at its def point, the value is flagged as
	/// Pruned and \p ShrinkMainRange is set. \p LI must be the interval whose
	/// main range is this JoinVals' range.
	void JoinVals::pruneMainSegments(LiveInterval &LI, bool &ShrinkMainRange) {
	  assert(&static_cast<LiveRange&>(LI) == &LR);

	  for (unsigned i = 0, e = LR.getNumValNums(); i != e; ++i) {
	    if (Vals[i].Resolution != CR_Keep)
	      continue;
	    VNInfo *VNI = LR.getValNumInfo(i);
	    if (VNI->isUnused() || VNI->isPHIDef() || isDefInSubRange(LI, VNI->def))
	      continue;
	    Vals[i].Pruned = true;
	    ShrinkMainRange = true;
	  }
	}

	/// Remove from the range every kept value that is both an erasable
	/// IMPLICIT_DEF and pruned: the value number is marked unused and then
	/// removed from the live range.
	void JoinVals::removeImplicitDefs() {
	  for (unsigned i = 0, e = LR.getNumValNums(); i != e; ++i) {
	    Val &V = Vals[i];
	    if (V.Resolution != CR_Keep || !V.ErasableImplicitDef || !V.Pruned)
	      continue;

	    VNInfo *VNI = LR.getValNumInfo(i);
	    VNI->markUnused();
	    LR.removeValNo(VNI);
	  }
	}

	/// Erase the instructions that define values which are no longer needed.
	///
	/// Values resolved as CR_Erase have their defining instruction erased.
	/// Values resolved as CR_Keep are handled only when they are pruned, erasable
	/// IMPLICIT_DEFs: the value is removed from the range (extending the previous
	/// main-range segment when a subrange is live across the def) and then falls
	/// through to the same erasure path.
	///
	/// \param ErasedInstrs receives every instruction that is erased.
	/// \param ShrinkRegs   receives virtual source registers of erased copies
	///                     that are neither the source nor the destination of
	///                     the pair being coalesced.
	/// \param LI           the interval owning this range, or null; used to
	///                     consult subranges when removing a main-range value.
	void JoinVals::eraseInstrs(SmallPtrSetImpl<MachineInstr*> &ErasedInstrs,
	                           SmallVectorImpl<unsigned> &ShrinkRegs,
	                           LiveInterval *LI) {
	  for (unsigned i = 0, e = LR.getNumValNums(); i != e; ++i) {
	    // Get the def location before markUnused() below invalidates it.
	    SlotIndex Def = LR.getValNumInfo(i)->def;
	    switch (Vals[i].Resolution) {
	    case CR_Keep: {
	      // If an IMPLICIT_DEF value is pruned, it doesn't serve a purpose any
	      // longer. The IMPLICIT_DEF instructions are only inserted by
	      // PHIElimination to guarantee that all PHI predecessors have a value.
	      if (!Vals[i].ErasableImplicitDef || !Vals[i].Pruned)
	        break;
	      // Remove value number i from LR.
	      // For intervals with subranges, removing a segment from the main range
	      // may require extending the previous segment: for each definition of
	      // a subregister, there will be a corresponding def in the main range.
	      // That def may fall in the middle of a segment from another subrange.
	      // In such cases, removing this def from the main range must be
	      // complemented by extending the main range to account for the liveness
	      // of the other subrange.
	      VNInfo *VNI = LR.getValNumInfo(i);
	      SlotIndex Def = VNI->def;
	      // The new end point of the main range segment to be extended.
	      SlotIndex NewEnd;
	      if (LI != nullptr) {
	        LiveRange::iterator I = LR.FindSegmentContaining(Def);
	        assert(I != LR.end());
	        // Do not extend beyond the end of the segment being removed.
	        // The segment may have been pruned in preparation for joining
	        // live ranges.
	        NewEnd = I->end;
	      }

	      LR.removeValNo(VNI);
	      // Note that this VNInfo is reused and still referenced in NewVNInfo,
	      // make it appear like an unused value number.
	      VNI->markUnused();

	      if (LI != nullptr && LI->hasSubRanges()) {
	        assert(static_cast<LiveRange*>(LI) == &LR);
	        // Determine the end point based on the subrange information:
	        // minimum of (earliest def of next segment,
	        //             latest end point of containing segment)
	        SlotIndex ED, LE;
	        for (LiveInterval::SubRange &SR : LI->subranges()) {
	          LiveRange::iterator I = SR.find(Def);
	          if (I == SR.end())
	            continue;
	          if (I->start > Def)
	            ED = ED.isValid() ? std::min(ED, I->start) : I->start;
	          else
	            LE = LE.isValid() ? std::max(LE, I->end) : I->end;
	        }
	        if (LE.isValid())
	          NewEnd = std::min(NewEnd, LE);
	        if (ED.isValid())
	          NewEnd = std::min(NewEnd, ED);

	        // We only want to do the extension if there was a subrange that
	        // was live across Def.
	        if (LE.isValid()) {
	          LiveRange::iterator S = LR.find(Def);
	          if (S != LR.begin())
	            std::prev(S)->end = NewEnd;
	        }
	      }
	      DEBUG({
	        dbgs() << "\t\tremoved " << i << '@' << Def << ": " << LR << '\n';
	        if (LI != nullptr)
	          dbgs() << "\t\t LHS = " << *LI << '\n';
	      });
	      LLVM_FALLTHROUGH;
	    }

	    case CR_Erase: {
	      MachineInstr *MI = Indexes->getInstructionFromIndex(Def);
	      assert(MI && "No instruction to erase");
	      if (MI->isCopy()) {
	        // The copy's source may need trimming once the copy is gone, unless
	        // it is one of the two registers being joined.
	        unsigned Reg = MI->getOperand(1).getReg();
	        if (TargetRegisterInfo::isVirtualRegister(Reg) &&
	            Reg != CP.getSrcReg() && Reg != CP.getDstReg())
	          ShrinkRegs.push_back(Reg);
	      }
	      ErasedInstrs.insert(MI);
	      DEBUG(dbgs() << "\t\terased:\t" << Def << '\t' << *MI);
	      LIS->RemoveMachineInstrFromMaps(*MI);
	      MI->eraseFromParent();
	      break;
	    }
	    default:
	      break;
	    }
	  }
	}

	/// Join the subregister live range \p RRange into \p LRange.
	///
	/// Both ranges describe the lanes in \p LaneMask of the registers in \p CP.
	/// Value mapping and conflict resolution are expected to succeed because the
	/// main ranges were already joined successfully; a failure is treated as
	/// unreachable. Segments pruned before the join are restored afterwards by
	/// extending \p LRange to the collected end points.
	void RegisterCoalescer::joinSubRegRanges(LiveRange &LRange, LiveRange &RRange,
	                                         LaneBitmask LaneMask,
	                                         const CoalescerPair &CP) {
	  SmallVector<VNInfo*, 16> NewVNInfo;
	  JoinVals RHSVals(RRange, CP.getSrcReg(), CP.getSrcIdx(), LaneMask,
	                   NewVNInfo, CP, LIS, TRI, true, true);
	  JoinVals LHSVals(LRange, CP.getDstReg(), CP.getDstIdx(), LaneMask,
	                   NewVNInfo, CP, LIS, TRI, true, true);

	  // Compute NewVNInfo and resolve conflicts (see also joinVirtRegs())
	  // We should be able to resolve all conflicts here as we could successfully do
	  // it on the mainrange already. There is however a problem when multiple
	  // ranges get mapped to the "overflow" lane mask bit which creates unexpected
	  // interferences.
	  if (!LHSVals.mapValues(RHSVals) || !RHSVals.mapValues(LHSVals)) {
	    // We already determined that it is legal to merge the intervals, so this
	    // should never fail.
	    llvm_unreachable("*** Couldn't join subrange!\n");
	  }
	  if (!LHSVals.resolveConflicts(RHSVals) ||
	      !RHSVals.resolveConflicts(LHSVals)) {
	    // We already determined that it is legal to merge the intervals, so this
	    // should never fail.
	    llvm_unreachable("*** Couldn't join subrange!\n");
	  }

	  // The merging algorithm in LiveInterval::join() can't handle conflicting
	  // value mappings, so we need to remove any live ranges that overlap a
	  // CR_Replace resolution. Collect a set of end points that can be used to
	  // restore the live range after joining.
	  SmallVector<SlotIndex, 8> EndPoints;
	  LHSVals.pruneValues(RHSVals, EndPoints, false);
	  RHSVals.pruneValues(LHSVals, EndPoints, false);

	  LHSVals.removeImplicitDefs();
	  RHSVals.removeImplicitDefs();

	  LRange.verify();
	  RRange.verify();

	  // Join RRange into LHS.
	  LRange.join(RRange, LHSVals.getAssignments(), RHSVals.getAssignments(),
	              NewVNInfo);

	  DEBUG(dbgs() << "\t\tjoined lanes: " << LRange << "\n");
	  if (EndPoints.empty())
	    return;

	  // Recompute the parts of the live range we had to remove because of
	  // CR_Replace conflicts.
	  DEBUG({
	    dbgs() << "\t\trestoring liveness to " << EndPoints.size() << " points: ";
	    for (unsigned i = 0, n = EndPoints.size(); i != n; ++i) {
	      dbgs() << EndPoints[i];
	      if (i != n-1)
	        dbgs() << ',';
	    }
	    dbgs() << ": " << LRange << '\n';
	  });
	  LIS->extendToIndices(LRange, EndPoints);
	}

	/// Merge the live range \p ToMerge, covering the lanes in \p LaneMask, into
	/// the subranges of \p LI.
	///
	/// Every existing subrange that shares lanes with \p LaneMask is joined with
	/// a copy of \p ToMerge on the common lanes; if the subrange also covers
	/// lanes outside \p LaneMask it is split first. Lanes of \p LaneMask that no
	/// existing subrange covered get a new subrange created from \p ToMerge.
	void RegisterCoalescer::mergeSubRangeInto(LiveInterval &LI,
	const LiveRange &ToMerge,
	LaneBitmask LaneMask,
	CoalescerPair &CP) {
	BumpPtrAllocator &Allocator = LIS->getVNInfoAllocator();
	for (LiveInterval::SubRange &R : LI.subranges()) {
	LaneBitmask RMask = R.LaneMask;
	// LaneMask of subregisters common to subrange R and ToMerge.
	LaneBitmask Common = RMask & LaneMask;
	// There is nothing to do without common subregs.
	if (Common.none())
	continue;

	DEBUG(dbgs() << "\t\tCopy+Merge " << PrintLaneMask(RMask) << " into "
	<< PrintLaneMask(Common) << '\n');
	// LaneMask of subregisters contained in the R range but not in ToMerge,
	// they have to split into their own subrange.
	LaneBitmask LRest = RMask & ~LaneMask;
	LiveInterval::SubRange *CommonRange;
	if (LRest.any()) {
	R.LaneMask = LRest;
	DEBUG(dbgs() << "\t\tReduce Lane to " << PrintLaneMask(LRest) << '\n');
	// Duplicate SubRange for newly merged common stuff.
	CommonRange = LI.createSubRangeFrom(Allocator, Common, R);
	} else {
	// Reuse the existing range.
	R.LaneMask = Common;
	CommonRange = &R;
	}
	// joinSubRegRanges() takes its right-hand range by non-const reference, so
	// join against a private copy rather than ToMerge itself.
	LiveRange RangeCopy(ToMerge, Allocator);
	joinSubRegRanges(*CommonRange, RangeCopy, Common, CP);
	// These lanes are now handled; drop them from the remaining mask.
	LaneMask &= ~RMask;
	}

	// Whatever is left in LaneMask was not covered by any existing subrange.
	if (LaneMask.any()) {
	DEBUG(dbgs() << "\t\tNew Lane " << PrintLaneMask(LaneMask) << '\n');
	LI.createSubRangeFrom(Allocator, LaneMask, ToMerge);
	}
	}

	/// Try to join the live intervals of the two virtual registers in \p CP.
	///
	/// The source interval (RHS) is merged into the destination interval (LHS).
	/// When either interval has subranges, the subranges are rewritten in terms
	/// of the coalesced register and merged first. Copies and IMPLICIT_DEFs made
	/// redundant by the join are erased, and registers read by erased copies are
	/// shrunk to their remaining uses.
	///
	/// \returns false if value mapping or conflict resolution fails, in which
	/// case the function returns before modifying the intervals; true once the
	/// intervals have been joined.
	bool RegisterCoalescer::joinVirtRegs(CoalescerPair &CP) {
	  SmallVector<VNInfo*, 16> NewVNInfo;
	  LiveInterval &RHS = LIS->getInterval(CP.getSrcReg());
	  LiveInterval &LHS = LIS->getInterval(CP.getDstReg());
	  bool TrackSubRegLiveness = MRI->shouldTrackSubRegLiveness(*CP.getNewRC());
	  JoinVals RHSVals(RHS, CP.getSrcReg(), CP.getSrcIdx(), LaneBitmask::getNone(),
	                   NewVNInfo, CP, LIS, TRI, false, TrackSubRegLiveness);
	  JoinVals LHSVals(LHS, CP.getDstReg(), CP.getDstIdx(), LaneBitmask::getNone(),
	                   NewVNInfo, CP, LIS, TRI, false, TrackSubRegLiveness);

	  DEBUG(dbgs() << "\t\tRHS = " << RHS
	               << "\n\t\tLHS = " << LHS
	               << '\n');

	  // First compute NewVNInfo and the simple value mappings.
	  // Detect impossible conflicts early.
	  if (!LHSVals.mapValues(RHSVals) || !RHSVals.mapValues(LHSVals))
	    return false;

	  // Some conflicts can only be resolved after all values have been mapped.
	  if (!LHSVals.resolveConflicts(RHSVals) || !RHSVals.resolveConflicts(LHSVals))
	    return false;

	  // All clear, the live ranges can be merged.
	  if (RHS.hasSubRanges() || LHS.hasSubRanges()) {
	    BumpPtrAllocator &Allocator = LIS->getVNInfoAllocator();

	    // Transform lanemasks from the LHS to masks in the coalesced register and
	    // create initial subranges if necessary.
	    unsigned DstIdx = CP.getDstIdx();
	    if (!LHS.hasSubRanges()) {
	      LaneBitmask Mask = DstIdx == 0 ? CP.getNewRC()->getLaneMask()
	                                     : TRI->getSubRegIndexLaneMask(DstIdx);
	      // LHS must support subregs or we wouldn't be in this codepath.
	      assert(Mask.any());
	      LHS.createSubRangeFrom(Allocator, Mask, LHS);
	    } else if (DstIdx != 0) {
	      // Transform LHS lanemasks to new register class if necessary.
	      for (LiveInterval::SubRange &R : LHS.subranges()) {
	        LaneBitmask Mask = TRI->composeSubRegIndexLaneMask(DstIdx, R.LaneMask);
	        R.LaneMask = Mask;
	      }
	    }
	    DEBUG(dbgs() << "\t\tLHST = " << PrintReg(CP.getDstReg())
	                 << ' ' << LHS << '\n');

	    // Determine lanemasks of RHS in the coalesced register and merge subranges.
	    unsigned SrcIdx = CP.getSrcIdx();
	    if (!RHS.hasSubRanges()) {
	      LaneBitmask Mask = SrcIdx == 0 ? CP.getNewRC()->getLaneMask()
	                                     : TRI->getSubRegIndexLaneMask(SrcIdx);
	      mergeSubRangeInto(LHS, RHS, Mask, CP);
	    } else {
	      // Pair up subranges and merge.
	      for (LiveInterval::SubRange &R : RHS.subranges()) {
	        LaneBitmask Mask = TRI->composeSubRegIndexLaneMask(SrcIdx, R.LaneMask);
	        mergeSubRangeInto(LHS, R, Mask, CP);
	      }
	    }
	    DEBUG(dbgs() << "\tJoined SubRanges " << LHS << "\n");

	    // Pruning implicit defs from subranges may result in the main range
	    // having stale segments.
	    LHSVals.pruneMainSegments(LHS, ShrinkMainRange);

	    LHSVals.pruneSubRegValues(LHS, ShrinkMask);
	    RHSVals.pruneSubRegValues(LHS, ShrinkMask);
	  }

	  // The merging algorithm in LiveInterval::join() can't handle conflicting
	  // value mappings, so we need to remove any live ranges that overlap a
	  // CR_Replace resolution. Collect a set of end points that can be used to
	  // restore the live range after joining.
	  SmallVector<SlotIndex, 8> EndPoints;
	  LHSVals.pruneValues(RHSVals, EndPoints, true);
	  RHSVals.pruneValues(LHSVals, EndPoints, true);

	  // Erase COPY and IMPLICIT_DEF instructions. This may cause some external
	  // registers to require trimming.
	  SmallVector<unsigned, 8> ShrinkRegs;
	  LHSVals.eraseInstrs(ErasedInstrs, ShrinkRegs, &LHS);
	  RHSVals.eraseInstrs(ErasedInstrs, ShrinkRegs);
	  while (!ShrinkRegs.empty())
	    shrinkToUses(&LIS->getInterval(ShrinkRegs.pop_back_val()));

	  // Join RHS into LHS.
	  LHS.join(RHS, LHSVals.getAssignments(), RHSVals.getAssignments(), NewVNInfo);

	  // Kill flags are going to be wrong if the live ranges were overlapping.
	  // Eventually, we should simply clear all kill flags when computing live
	  // ranges. They are reinserted after register allocation.
	  MRI->clearKillFlags(LHS.reg);
	  MRI->clearKillFlags(RHS.reg);

	  if (!EndPoints.empty()) {
	    // Recompute the parts of the live range we had to remove because of
	    // CR_Replace conflicts.
	    DEBUG({
	      dbgs() << "\t\trestoring liveness to " << EndPoints.size() << " points: ";
	      for (unsigned i = 0, n = EndPoints.size(); i != n; ++i) {
	        dbgs() << EndPoints[i];
	        if (i != n-1)
	          dbgs() << ',';
	      }
	      dbgs() << ": " << LHS << '\n';
	    });
	    LIS->extendToIndices((LiveRange&)LHS, EndPoints);
	  }

	  return true;
	}

	/// Join the intervals described by \p CP, dispatching on whether the pair
	/// involves a physical register. Returns the result of the chosen join.
	bool RegisterCoalescer::joinIntervals(CoalescerPair &CP) {
	  if (CP.isPhys())
	    return joinReservedPhysReg(CP);
	  return joinVirtRegs(CP);
	}

	namespace {
	/// Per-block record used to order basic blocks for coalescing.
	struct MBBPriorityInfo {
	  MachineBasicBlock *MBB; ///< The block being ranked.
	  unsigned Depth;         ///< Depth value supplied by the creator.
	  bool IsSplit;           ///< Split flag supplied by the creator.

	  MBBPriorityInfo(MachineBasicBlock *Block, unsigned BlockDepth,
	                  bool BlockIsSplit)
	      : MBB(Block), Depth(BlockDepth), IsSplit(BlockIsSplit) {}
	};
	} // end anonymous namespace

	/// C-style three-way comparator ordering blocks for coalescing.
	///
	/// Keys, in order: greater Depth first, then blocks flagged IsSplit, then
	/// blocks with more predecessor + successor edges, then lower block number.
	///
	/// EnableGlobalCopies assumes that the primary sort key is loop depth.
	static int compareMBBPriority(const MBBPriorityInfo *LHS,
	                              const MBBPriorityInfo *RHS) {
	  // Deeper loops first
	  if (LHS->Depth > RHS->Depth)
	    return -1;
	  if (LHS->Depth < RHS->Depth)
	    return 1;

	  // Try to unsplit critical edges next.
	  if (LHS->IsSplit != RHS->IsSplit) {
	    if (LHS->IsSplit)
	      return -1;
	    return 1;
	  }

	  // Prefer blocks that are more connected in the CFG. This takes care of
	  // the most difficult copies first while intervals are short.
	  const unsigned LHSEdges = LHS->MBB->pred_size() + LHS->MBB->succ_size();
	  const unsigned RHSEdges = RHS->MBB->pred_size() + RHS->MBB->succ_size();
	  if (LHSEdges > RHSEdges)
	    return -1;
	  if (LHSEdges < RHSEdges)
	    return 1;

	  // As a last resort, sort by block number.
	  if (LHS->MBB->getNumber() < RHS->MBB->getNumber())
	    return -1;
	  return 1;
	}

	/// \returns true if the given copy uses or defines a local live range.
	static bool isLocalCopy(MachineInstr Copy, const LiveIntervals LIS) {
	if (!Copy->isCopy())
	return false;

	if (Copy->getOperand(1).isUndef())
	return false;

	unsigned SrcReg = Copy->getOperand(1).getReg();
	unsigned DstReg = Copy->getOperand(0).getReg();
	if (TargetRegisterInfo::isPhysicalRegister(SrcReg)
	\|\| TargetRegisterInfo::isPhysicalRegister(DstReg))
	return false;

	return LIS->intervalIsInOneMBB(LIS->getInterval(SrcReg))
	\|\| LIS->intervalIsInOneMBB(LIS->getInterval(DstReg));
	}

	/// Attempt to coalesce every copy in \p CurrList.
	///
	/// Entries are nulled out when they were already erased, when the join
	/// succeeded, or when joinCopy() reports that retrying is pointless. Entries
	/// left non-null are candidates for a later attempt.
	///
	/// \returns true if at least one copy was joined.
	bool RegisterCoalescer::
	copyCoalesceWorkList(MutableArrayRef<MachineInstr*> CurrList) {
	  bool Progress = false;
	  for (unsigned i = 0, e = CurrList.size(); i != e; ++i) {
	    if (!CurrList[i])
	      continue;
	    // Skip instruction pointers that have already been erased, for example by
	    // dead code elimination.
	    if (ErasedInstrs.erase(CurrList[i])) {
	      CurrList[i] = nullptr;
	      continue;
	    }
	    bool Again = false;
	    bool Success = joinCopy(CurrList[i], Again);
	    Progress |= Success;
	    if (Success || !Again)
	      CurrList[i] = nullptr;
	  }
	  return Progress;
	}

	/// Check if DstReg is a terminal node.
	/// I.e., no non-debug copy-like instruction other than \p Copy references it.
	static bool isTerminalReg(unsigned DstReg, const MachineInstr &Copy,
	                          const MachineRegisterInfo *MRI) {
	  assert(Copy.isCopyLike());
	  // Check if the destination of this copy has any other affinity.
	  for (const MachineInstr &User : MRI->reg_nodbg_instructions(DstReg)) {
	    if (&User == &Copy)
	      continue;
	    if (User.isCopyLike())
	      return false;
	  }
	  return true;
	}

	/// Decide whether \p Copy should be deferred by the terminal rule.
	///
	/// The rule applies when the destination of \p Copy is a terminal node (it
	/// has no copy-like user other than \p Copy) and another copy-like
	/// instruction in the same block connects the source of \p Copy to a
	/// non-terminal virtual register whose interval overlaps the destination's.
	///
	/// \returns true if the rule applies; false otherwise, including when
	/// UseTerminalRule is off or either register of \p Copy is physical.
	bool RegisterCoalescer::applyTerminalRule(const MachineInstr &Copy) const {
	  assert(Copy.isCopyLike());
	  if (!UseTerminalRule)
	    return false;
	  unsigned DstReg, DstSubReg, SrcReg, SrcSubReg;
	  isMoveInstr(*TRI, &Copy, SrcReg, DstReg, SrcSubReg, DstSubReg);
	  // Check if the destination of this copy has any other affinity.
	  if (TargetRegisterInfo::isPhysicalRegister(DstReg) ||
	      // If SrcReg is a physical register, the copy won't be coalesced.
	      // Ignoring it may have other side effect (like missing
	      // rematerialization). So keep it.
	      TargetRegisterInfo::isPhysicalRegister(SrcReg) ||
	      !isTerminalReg(DstReg, Copy, MRI))
	    return false;

	  // DstReg is a terminal node. Check if it interferes with any other
	  // copy involving SrcReg.
	  const MachineBasicBlock *OrigBB = Copy.getParent();
	  const LiveInterval &DstLI = LIS->getInterval(DstReg);
	  for (const MachineInstr &MI : MRI->reg_nodbg_instructions(SrcReg)) {
	    // Technically we should check if the weight of the new copy is
	    // interesting compared to the other one and update the weight
	    // of the copies accordingly. However, this would only work if
	    // we would gather all the copies first then coalesce, whereas
	    // right now we interleave both actions.
	    // For now, just consider the copies that are in the same block.
	    if (&MI == &Copy || !MI.isCopyLike() || MI.getParent() != OrigBB)
	      continue;
	    unsigned OtherReg, OtherSubReg, OtherSrcReg, OtherSrcSubReg;
	    // Decode the *other* copy (MI), not Copy: decoding Copy again would make
	    // OtherReg always equal DstReg and the checks below meaningless.
	    isMoveInstr(*TRI, &MI, OtherSrcReg, OtherReg, OtherSrcSubReg,
	                OtherSubReg);
	    // MI references SrcReg on one side; look at the register on the other.
	    if (OtherReg == SrcReg)
	      OtherReg = OtherSrcReg;
	    // Check if OtherReg is a non-terminal.
	    if (TargetRegisterInfo::isPhysicalRegister(OtherReg) ||
	        isTerminalReg(OtherReg, MI, MRI))
	      continue;
	    // Check that OtherReg interfere with DstReg.
	    if (LIS->getInterval(OtherReg).overlaps(DstLI)) {
	      DEBUG(dbgs() << "Apply terminal rule for: " << PrintReg(DstReg) << '\n');
	      return true;
	    }
	  }
	  return false;
	}

	/// Collect the copy-like instructions of \p MBB and try to coalesce them.
	///
	/// With JoinGlobalCopies, copies classified as local by isLocalCopy() go to
	/// LocalWorkList and the rest to WorkList; otherwise everything goes to
	/// WorkList. In both modes, copies selected by applyTerminalRule() are
	/// appended after the others. Only the WorkList entries added by this call
	/// are attempted here; LocalWorkList is left for a later coalesceLocals().
	void
	RegisterCoalescer::copyCoalesceInMBB(MachineBasicBlock *MBB) {
	DEBUG(dbgs() << MBB->getName() << ":\n");

	// Collect all copy-like instructions in MBB. Don't start coalescing anything
	// yet, it might invalidate the iterator.
	// Remember where this block's entries start in WorkList.
	const unsigned PrevSize = WorkList.size();
	if (JoinGlobalCopies) {
	SmallVector<MachineInstr*, 2> LocalTerminals;
	SmallVector<MachineInstr*, 2> GlobalTerminals;
	// Coalesce copies bottom-up to coalesce local defs before local uses. They
	// are not inherently easier to resolve, but slightly preferable until we
	// have local live range splitting. In particular this is required by
	// cmp+jmp macro fusion.
	// NOTE(review): the loop below walks the block from begin() to end(), which
	// does not obviously match "bottom-up" above -- confirm the intended order.
	for (MachineBasicBlock::iterator MII = MBB->begin(), E = MBB->end();
	MII != E; ++MII) {
	if (!MII->isCopyLike())
	continue;
	bool ApplyTerminalRule = applyTerminalRule(*MII);
	if (isLocalCopy(&(*MII), LIS)) {
	if (ApplyTerminalRule)
	LocalTerminals.push_back(&(*MII));
	else
	LocalWorkList.push_back(&(*MII));
	} else {
	if (ApplyTerminalRule)
	GlobalTerminals.push_back(&(*MII));
	else
	WorkList.push_back(&(*MII));
	}
	}
	// Append the copies evicted by the terminal rule at the end of the list.
	LocalWorkList.append(LocalTerminals.begin(), LocalTerminals.end());
	WorkList.append(GlobalTerminals.begin(), GlobalTerminals.end());
	}
	else {
	SmallVector<MachineInstr*, 2> Terminals;
	for (MachineInstr &MII : *MBB)
	if (MII.isCopyLike()) {
	if (applyTerminalRule(MII))
	Terminals.push_back(&MII);
	else
	WorkList.push_back(&MII);
	}
	// Append the copies evicted by the terminal rule at the end of the list.
	WorkList.append(Terminals.begin(), Terminals.end());
	}
	// Try coalescing the collected copies immediately, and remove the nulls.
	// This prevents the WorkList from getting too large since most copies are
	// joinable on the first attempt.
	MutableArrayRef<MachineInstr*>
	CurrList(WorkList.begin() + PrevSize, WorkList.end());
	// Nulls are only compacted away when the attempt made progress.
	if (copyCoalesceWorkList(CurrList))
	WorkList.erase(std::remove(WorkList.begin() + PrevSize, WorkList.end(),
	(MachineInstr*)nullptr), WorkList.end());
	}

	void RegisterCoalescer::coalesceLocals() {
	copyCoalesceWorkList(LocalWorkList);
	for (unsigned j = 0, je = LocalWorkList.size(); j != je; ++j) {
	if (LocalWorkList[j])
	WorkList.push_back(LocalWorkList[j]);
	}
	LocalWorkList.clear();
	}

	/// Coalesce copies across the whole function.
	///
	/// Blocks are visited in the order defined by compareMBBPriority(). When
	/// JoinGlobalCopies is set, pending local copies are flushed each time the
	/// loop depth decreases. Afterwards the remaining work list is retried until
	/// a full pass makes no progress.
	void RegisterCoalescer::joinAllIntervals() {
	  DEBUG(dbgs() << "******** JOINING INTERVALS *********\n");
	  assert(WorkList.empty() && LocalWorkList.empty() && "Old data still around.");

	  std::vector<MBBPriorityInfo> MBBs;
	  MBBs.reserve(MF->size());
	  for (MachineFunction::iterator I = MF->begin(), E = MF->end(); I != E; ++I) {
	    MachineBasicBlock *MBB = &*I;
	    MBBs.push_back(MBBPriorityInfo(MBB, Loops->getLoopDepth(MBB),
	                                   JoinSplitEdges && isSplitEdge(MBB)));
	  }
	  array_pod_sort(MBBs.begin(), MBBs.end(), compareMBBPriority);

	  // Coalesce intervals in MBB priority order.
	  unsigned CurrDepth = UINT_MAX;
	  for (unsigned i = 0, e = MBBs.size(); i != e; ++i) {
	    // Try coalescing the collected local copies for deeper loops.
	    if (JoinGlobalCopies && MBBs[i].Depth < CurrDepth) {
	      coalesceLocals();
	      CurrDepth = MBBs[i].Depth;
	    }
	    copyCoalesceInMBB(MBBs[i].MBB);
	  }
	  coalesceLocals();

	  // Joining intervals can allow other intervals to be joined.  Iteratively join
	  // until we make no progress.
	  while (copyCoalesceWorkList(WorkList))
	    /* empty */ ;
	}

	/// Drop the per-function state held in this pass's containers.
	void RegisterCoalescer::releaseMemory() {
	  // The four containers are independent of one another.
	  WorkList.clear();
	  ErasedInstrs.clear();
	  InflateRegs.clear();
	  DeadDefs.clear();
	}

	/// Pass entry point: run register coalescing on \p fn.
	///
	/// Caches the function, target and analysis handles, joins intervals when
	/// EnableJoining is set, and then tries to recompute (inflate) the register
	/// class of each register recorded in InflateRegs. Machine verification is
	/// run before and after when VerifyCoalescing is set.
	///
	/// \returns true unconditionally.
	bool RegisterCoalescer::runOnMachineFunction(MachineFunction &fn) {
	MF = &fn;
	MRI = &fn.getRegInfo();
	TM = &fn.getTarget();
	const TargetSubtargetInfo &STI = fn.getSubtarget();
	TRI = STI.getRegisterInfo();
	TII = STI.getInstrInfo();
	LIS = &getAnalysis<LiveIntervals>();
	AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
	Loops = &getAnalysis<MachineLoopInfo>();
	// An explicit EnableGlobalCopies setting overrides the subtarget default.
	if (EnableGlobalCopies == cl::BOU_UNSET)
	JoinGlobalCopies = STI.enableJoinGlobalCopies();
	else
	JoinGlobalCopies = (EnableGlobalCopies == cl::BOU_TRUE);

	// The MachineScheduler does not currently require JoinSplitEdges. This will
	// either be enabled unconditionally or replaced by a more general live range
	// splitting optimization.
	JoinSplitEdges = EnableJoinSplits;

	DEBUG(dbgs() << "******** SIMPLE REGISTER COALESCING ********\n"
	<< "********** Function: " << MF->getName() << '\n');

	if (VerifyCoalescing)
	MF->verify(this, "Before register coalescing");

	RegClassInfo.runOnMachineFunction(fn);

	// Join (coalesce) intervals if requested.
	if (EnableJoining)
	joinAllIntervals();

	// After deleting a lot of copies, register classes may be less constrained.
	// Removing sub-register operands may allow GR32_ABCD -> GR32 and DPR_VFP2 ->
	// DPR inflation.
	// Sort and deduplicate so each register is considered once.
	array_pod_sort(InflateRegs.begin(), InflateRegs.end());
	InflateRegs.erase(std::unique(InflateRegs.begin(), InflateRegs.end()),
	InflateRegs.end());
	DEBUG(dbgs() << "Trying to inflate " << InflateRegs.size() << " regs.\n");
	for (unsigned i = 0, e = InflateRegs.size(); i != e; ++i) {
	unsigned Reg = InflateRegs[i];
	// Skip registers with no remaining non-debug references.
	if (MRI->reg_nodbg_empty(Reg))
	continue;
	if (MRI->recomputeRegClass(Reg)) {
	DEBUG(dbgs() << PrintReg(Reg) << " inflated to "
	<< TRI->getRegClassName(MRI->getRegClass(Reg)) << '\n');
	++NumInflated;

	LiveInterval &LI = LIS->getInterval(Reg);
	if (LI.hasSubRanges()) {
	// If the inflated register class does not support subregisters anymore
	// remove the subranges.
	if (!MRI->shouldTrackSubRegLiveness(Reg)) {
	LI.clearSubRanges();
	} else {
	#ifndef NDEBUG
	LaneBitmask MaxMask = MRI->getMaxLaneMaskForVReg(Reg);
	// If subranges are still supported, then the same subregs
	// should still be supported.
	for (LiveInterval::SubRange &S : LI.subranges()) {
	assert((S.LaneMask & ~MaxMask).none());
	}
	#endif
	}
	}
	}
	}

	DEBUG(dump());
	if (VerifyCoalescing)
	MF->verify(this, "After register coalescing");
	return true;
	}

	/// Print this pass's state to \p O by delegating to the cached LiveIntervals
	/// analysis; \p m is forwarded unchanged.
	void RegisterCoalescer::print(raw_ostream &O, const Module* m) const {
	LIS->print(O, m);
	}
	Index: projects/clang400-import/contrib/llvm/lib/MC/MCCodeView.cpp
	===================================================================
	--- projects/clang400-import/contrib/llvm/lib/MC/MCCodeView.cpp (revision 313642)
	+++ projects/clang400-import/contrib/llvm/lib/MC/MCCodeView.cpp (revision 313643)
	@@ -1,572 +1,572 @@
	//===- MCCodeView.cpp - Machine Code CodeView support -----------*- C++ -*-===//
	//
	// The LLVM Compiler Infrastructure
	//
	// This file is distributed under the University of Illinois Open Source
	// License. See LICENSE.TXT for details.
	//
	//===----------------------------------------------------------------------===//
	//
	// Holds state from .cv_file and .cv_loc directives for later emission.
	//
	//===----------------------------------------------------------------------===//

	#include "llvm/MC/MCCodeView.h"
	#include "llvm/MC/MCAsmLayout.h"
	#include "llvm/ADT/STLExtras.h"
	#include "llvm/DebugInfo/CodeView/CodeView.h"
	#include "llvm/DebugInfo/CodeView/Line.h"
	#include "llvm/DebugInfo/CodeView/SymbolRecord.h"
	#include "llvm/MC/MCContext.h"
	#include "llvm/MC/MCObjectStreamer.h"
	#include "llvm/MC/MCValue.h"
	#include "llvm/Support/COFF.h"
	#include "llvm/Support/EndianStream.h"

	using namespace llvm;
	using namespace llvm::codeview;

	/// Construct an empty context; the constructor body does no work.
	CodeViewContext::CodeViewContext() = default;

	/// Delete the string table fragment unless it was flagged as inserted.
	CodeViewContext::~CodeViewContext() {
	  // Once InsertedStrTabFragment is set the fragment must not be deleted here.
	  if (InsertedStrTabFragment)
	    return;
	  // Strings may have been added to the table without the table ever being
	  // emitted anywhere; clean up the fragment in that case.
	  delete StrTabFragment;
	}

	/// A file number is valid for use with .cv_loc once a .cv_file has recorded a
	/// non-empty name for it.
	bool CodeViewContext::isValidFileNumber(unsigned FileNumber) const {
	  // File numbers are 1-based. FileNumber == 0 wraps to the maximum unsigned
	  // value here, which fails the range check below.
	  const unsigned Slot = FileNumber - 1;
	  return Slot < Filenames.size() && !Filenames[Slot].empty();
	}

	/// Record \p Filename under the 1-based \p FileNumber.
	///
	/// An empty name is recorded as "<stdin>". The table of names grows as
	/// needed to hold the slot.
	///
	/// \returns false if the slot already holds a name, true after recording.
	bool CodeViewContext::addFile(unsigned FileNumber, StringRef Filename) {
	assert(FileNumber > 0);
	// Intern the name as given. This happens before the duplicate check below,
	// so the string table is updated even when the function returns false.
	Filename = addToStringTable(Filename);
	unsigned Idx = FileNumber - 1;
	if (Idx >= Filenames.size())
	Filenames.resize(Idx + 1);

	if (Filename.empty())
	Filename = "<stdin>";

	if (!Filenames[Idx].empty())
	return false;

	// FIXME: We should store the string table offset of the filename, rather than
	// the filename itself for efficiency.
	// Intern again so that a substituted "<stdin>" is also in the string table.
	Filename = addToStringTable(Filename);

	Filenames[Idx] = Filename;
	return true;
	}

	bool CodeViewContext::recordFunctionId(unsigned FuncId) {
	if (FuncId >= Functions.size())
	Functions.resize(FuncId + 1);

	// Return false if this function info was already allocated.
	if (!Functions[FuncId].isUnallocatedFunctionInfo())
	return false;

	// Mark this as an allocated normal function, and leave the rest alone.
	Functions[FuncId].ParentFuncIdPlusOne = MCCVFunctionInfo::FunctionSentinel;
	return true;
	}

	/// Allocate \p FuncId as an inlined call site of function \p IAFunc.
	///
	/// \p IAFile, \p IALine and \p IACol are stored as the "inlined at" location.
	/// The new id is then registered in the InlinedAtMap of every function
	/// reached by walking parents, until an entry that is not an inlined call
	/// site is reached.
	///
	/// \returns false if \p FuncId was already allocated, true otherwise.
	bool CodeViewContext::recordInlinedCallSiteId(unsigned FuncId, unsigned IAFunc,
	unsigned IAFile, unsigned IALine,
	unsigned IACol) {
	if (FuncId >= Functions.size())
	Functions.resize(FuncId + 1);

	// Return false if this function info was already allocated.
	if (!Functions[FuncId].isUnallocatedFunctionInfo())
	return false;

	MCCVFunctionInfo::LineInfo InlinedAt;
	InlinedAt.File = IAFile;
	InlinedAt.Line = IALine;
	InlinedAt.Col = IACol;

	// Mark this as an inlined call site and record call site line info.
	MCCVFunctionInfo *Info = &Functions[FuncId];
	Info->ParentFuncIdPlusOne = IAFunc + 1;
	Info->InlinedAt = InlinedAt;

	// Walk up the call chain adding this function id to the InlinedAtMap of all
	// transitive callers until we hit a real function.
	// NOTE(review): the result of getCVFunctionInfo() is dereferenced without a
	// check; presumably callers guarantee that every parent id is already
	// recorded -- confirm.
	while (Info->isInlinedCallSite()) {
	InlinedAt = Info->InlinedAt;
	Info = getCVFunctionInfo(Info->getParentFuncId());
	Info->InlinedAtMap[FuncId] = InlinedAt;
	}

	return true;
	}

	/// Return the string table fragment, creating it on first use.
	MCDataFragment *CodeViewContext::getStringTableFragment() {
	  if (StrTabFragment)
	    return StrTabFragment;

	  StrTabFragment = new MCDataFragment();
	  // Start a new string table out with a null byte.
	  StrTabFragment->getContents().push_back('\0');
	  return StrTabFragment;
	}

	/// Intern \p S in the string table and return the table's own copy of it.
	///
	/// A string seen for the first time is appended, with its terminating null,
	/// to the string table fragment and mapped to its offset there.
	StringRef CodeViewContext::addToStringTable(StringRef S) {
	  SmallVectorImpl<char> &Contents = getStringTableFragment()->getContents();
	  // Offset the string will occupy if it turns out to be new.
	  const unsigned NextOffset = unsigned(Contents.size());
	  auto Result = StringTable.insert(std::make_pair(S, NextOffset));

	  // Return the string from the table, since it is stable.
	  StringRef Stable = Result.first->first();
	  if (!Result.second)
	    return Stable;

	  // The string map key is always null terminated.
	  Contents.append(Stable.begin(), Stable.end() + 1);
	  return Stable;
	}

	/// Return the offset recorded for \p S in the string table. \p S must have
	/// been added already, unless it is empty.
	unsigned CodeViewContext::getStringTableOffset(StringRef S) {
	  // A string table offset of zero is always the empty string.
	  if (S.empty())
	    return 0;

	  auto Entry = StringTable.find(S);
	  assert(Entry != StringTable.end());
	  return Entry->second;
	}

	/// Emit the string table substream to \p OS.
	///
	/// Output order: the 4-byte substream kind, the 4-byte length (end label
	/// minus begin label), the begin label, the string table fragment (only the
	/// first time this is called), alignment to 4 bytes, and the end label.
	void CodeViewContext::emitStringTable(MCObjectStreamer &OS) {
	MCContext &Ctx = OS.getContext();
	MCSymbol *StringBegin = Ctx.createTempSymbol("strtab_begin", false),
	*StringEnd = Ctx.createTempSymbol("strtab_end", false);

	OS.EmitIntValue(unsigned(ModuleSubstreamKind::StringTable), 4);
	OS.emitAbsoluteSymbolDiff(StringEnd, StringBegin, 4);
	OS.EmitLabel(StringBegin);

	// Put the string table data fragment here, if we haven't already put it
	// somewhere else. If somebody wants two string tables in their .s file, one
	// will just be empty.
	if (!InsertedStrTabFragment) {
	OS.insert(getStringTableFragment());
	InsertedStrTabFragment = true;
	}

	OS.EmitValueToAlignment(4, 0);

	OS.EmitLabel(StringEnd);
	}

	/// Emit the file checksums substream to \p OS.
	///
	/// Nothing is emitted when no files have been recorded. Otherwise the
	/// substream kind and length are written, followed by one 8-byte entry per
	/// slot in Filenames: the name's string table offset and a zero word.
	void CodeViewContext::emitFileChecksums(MCObjectStreamer &OS) {
	// Do nothing if there are no file checksums. Microsoft's linker rejects empty
	// CodeView substreams.
	if (Filenames.empty())
	return;

	MCContext &Ctx = OS.getContext();
	MCSymbol *FileBegin = Ctx.createTempSymbol("filechecksums_begin", false),
	*FileEnd = Ctx.createTempSymbol("filechecksums_end", false);

	OS.EmitIntValue(unsigned(ModuleSubstreamKind::FileChecksums), 4);
	OS.emitAbsoluteSymbolDiff(FileEnd, FileBegin, 4);
	OS.EmitLabel(FileBegin);

	// Emit an array of FileChecksum entries. We index into this table using the
	// user-provided file number. Each entry is currently 8 bytes, as we don't
	// emit checksums.
	for (StringRef Filename : Filenames) {
	OS.EmitIntValue(getStringTableOffset(Filename), 4);
	// Zero the next two fields and align back to 4 bytes. This indicates that
	// no checksum is present.
	OS.EmitIntValue(0, 4);
	}

	OS.EmitLabel(FileEnd);
	}

	/// Emit the line table substream for function \p FuncId to \p OS.
	///
	/// The substream header is followed by the section-relative address and
	/// section index of \p FuncBegin, a flags word, and the code size
	/// (\p FuncEnd - \p FuncBegin). The function's line entries are then emitted
	/// as one segment per run of consecutive entries sharing a file number.
	/// Column data is emitted only if some entry has a non-zero column.
	void CodeViewContext::emitLineTableForFunction(MCObjectStreamer &OS,
	                                               unsigned FuncId,
	                                               const MCSymbol *FuncBegin,
	                                               const MCSymbol *FuncEnd) {
	  MCContext &Ctx = OS.getContext();
	  MCSymbol *LineBegin = Ctx.createTempSymbol("linetable_begin", false),
	           *LineEnd = Ctx.createTempSymbol("linetable_end", false);

	  OS.EmitIntValue(unsigned(ModuleSubstreamKind::Lines), 4);
	  OS.emitAbsoluteSymbolDiff(LineEnd, LineBegin, 4);
	  OS.EmitLabel(LineBegin);
	  OS.EmitCOFFSecRel32(FuncBegin, /*Offset=*/0);
	  OS.EmitCOFFSectionIndex(FuncBegin);

	  // Actual line info.
	  std::vector<MCCVLineEntry> Locs = getFunctionLineEntries(FuncId);
	  bool HaveColumns = any_of(Locs, [](const MCCVLineEntry &LineEntry) {
	    return LineEntry.getColumn() != 0;
	  });
	  OS.EmitIntValue(HaveColumns ? int(LineFlags::HaveColumns) : 0, 2);
	  OS.emitAbsoluteSymbolDiff(FuncEnd, FuncBegin, 4);

	  for (auto I = Locs.begin(), E = Locs.end(); I != E;) {
	    // Emit a file segment for the run of locations that share a file id.
	    unsigned CurFileNum = I->getFileNum();
	    auto FileSegEnd =
	        std::find_if(I, E, [CurFileNum](const MCCVLineEntry &Loc) {
	          return Loc.getFileNum() != CurFileNum;
	        });
	    unsigned EntryCount = FileSegEnd - I;
	    OS.AddComment("Segment for file '" + Twine(Filenames[CurFileNum - 1]) +
	                  "' begins");
	    // Byte offset of the file's entry in the checksum table: entries are
	    // 8 bytes each and file numbers are 1-based.
	    OS.EmitIntValue(8 * (CurFileNum - 1), 4);
	    OS.EmitIntValue(EntryCount, 4);
	    // 12-byte segment header, 8 bytes per line entry, plus 4 bytes per entry
	    // of column data when present.
	    uint32_t SegmentSize = 12;
	    SegmentSize += 8 * EntryCount;
	    if (HaveColumns)
	      SegmentSize += 4 * EntryCount;
	    OS.EmitIntValue(SegmentSize, 4);

	    for (auto J = I; J != FileSegEnd; ++J) {
	      OS.emitAbsoluteSymbolDiff(J->getLabel(), FuncBegin, 4);
	      unsigned LineData = J->getLine();
	      if (J->isStmt())
	        LineData |= LineInfo::StatementFlag;
	      OS.EmitIntValue(LineData, 4);
	    }
	    if (HaveColumns) {
	      for (auto J = I; J != FileSegEnd; ++J) {
	        OS.EmitIntValue(J->getColumn(), 2);
	        OS.EmitIntValue(0, 2);
	      }
	    }
	    I = FileSegEnd;
	  }
	  OS.EmitLabel(LineEnd);
	}

	/// Append the variable-length encoding of \p Data to \p Buffer.
	///
	/// Values that fit in 7 bits take one byte; values that fit in 14 bits take
	/// two bytes with 0x80 or'ed into the first; values that fit in 29 bits take
	/// four bytes with 0xC0 or'ed into the first. Multi-byte forms are written
	/// most significant byte first.
	///
	/// \returns false, appending nothing, if \p Data needs more than 29 bits.
	static bool compressAnnotation(uint32_t Data, SmallVectorImpl<char> &Buffer) {
	  if (isUInt<7>(Data)) {
	    Buffer.push_back(Data);
	    return true;
	  }

	  if (isUInt<14>(Data)) {
	    Buffer.push_back((Data >> 8) | 0x80);
	    Buffer.push_back(Data & 0xff);
	    return true;
	  }

	  if (isUInt<29>(Data)) {
	    Buffer.push_back((Data >> 24) | 0xC0);
	    Buffer.push_back((Data >> 16) & 0xff);
	    Buffer.push_back((Data >> 8) & 0xff);
	    Buffer.push_back(Data & 0xff);
	    return true;
	  }

	  return false;
	}

	/// Overload for opcodes: encode the numeric value of \p Annotation into
	/// \p Buffer via the uint32_t overload and return its result.
	static bool compressAnnotation(BinaryAnnotationsOpCode Annotation,
	SmallVectorImpl<char> &Buffer) {
	return compressAnnotation(static_cast<uint32_t>(Annotation), Buffer);
	}

	/// Fold the sign of \p Data into the lowest bit of the result.
	///
	/// \p Data is interpreted as a two's complement value. If its top bit is set
	/// the result is the negated value shifted left by one with bit 0 set;
	/// otherwise it is the value shifted left by one.
	static uint32_t encodeSignedNumber(uint32_t Data) {
	  if (Data >> 31)
	    return ((-Data) << 1) | 1;
	  return Data << 1;
	}

	/// Queue an inline line table for later encoding.
	///
	/// All arguments except \p OS are forwarded unchanged to a new
	/// MCCVInlineLineTableFragment, which is also passed OS's current section.
	void CodeViewContext::emitInlineLineTableForFunction(MCObjectStreamer &OS,
	unsigned PrimaryFunctionId,
	unsigned SourceFileId,
	unsigned SourceLineNum,
	const MCSymbol *FnStartSym,
	const MCSymbol *FnEndSym) {
	// Create and insert a fragment into the current section that will be encoded
	// later.
	// The pointer returned by new is intentionally not kept here.
	new MCCVInlineLineTableFragment(PrimaryFunctionId, SourceFileId,
	SourceLineNum, FnStartSym, FnEndSym,
	OS.getCurrentSectionOnly());
	}

	/// Request a def-range record covering \p Ranges, each a (begin, end) pair of
	/// labels, prefixed by \p FixedSizePortion. No bytes are produced here: a
	/// MCCVDefRangeFragment is created for the streamer's current section and is
	/// encoded later.
	void CodeViewContext::emitDefRange(
	    MCObjectStreamer &OS,
	    ArrayRef<std::pair<const MCSymbol *, const MCSymbol *>> Ranges,
	    StringRef FixedSizePortion) {
	  // Create and insert a fragment into the current section that will be encoded
	  // later.
	  new MCCVDefRangeFragment(Ranges, FixedSizePortion,
	                           OS.getCurrentSectionOnly());
	}

	/// Evaluate `End - Begin` under \p Layout and return the distance as an
	/// unsigned value. The difference is asserted to evaluate to a known absolute
	/// value that is non-negative and below UINT_MAX.
	static unsigned computeLabelDiff(MCAsmLayout &Layout, const MCSymbol *Begin,
	                                 const MCSymbol *End) {
	  MCContext &Ctx = Layout.getAssembler().getContext();
	  const MCSymbolRefExpr::VariantKind Kind = MCSymbolRefExpr::VK_None;

	  // Build the expression (End - Begin) from plain symbol references.
	  const MCExpr *FromExpr = MCSymbolRefExpr::create(Begin, Kind, Ctx);
	  const MCExpr *ToExpr = MCSymbolRefExpr::create(End, Kind, Ctx);
	  const MCExpr *Distance =
	      MCBinaryExpr::create(MCBinaryExpr::Sub, ToExpr, FromExpr, Ctx);

	  int64_t Result;
	  bool Success = Distance->evaluateKnownAbsolute(Result, Layout);
	  assert(Success && "failed to evaluate label difference as absolute");
	  (void)Success; // Only read by the assert above.
	  assert(Result >= 0 && "negative label difference requested");
	  assert(Result < UINT_MAX && "label difference greater than 2GB");
	  return static_cast<unsigned>(Result);
	}

	/// Produce the contents of an inline line table fragment.
	///
	/// Collects the .cv_loc entries spanning the inline site \p Frag.SiteFuncId
	/// and every child site listed in its InlinedAtMap, then appends compressed
	/// binary annotations (file, line and code offset changes) to the fragment's
	/// buffer. Entries belonging to unrelated functions close the current PC range
	/// with a ChangeCodeLength annotation. A final ChangeCodeLength annotation is
	/// always appended, using the smaller of the distance to the function end
	/// symbol and the distance to the first entry after the extent (when that
	/// entry is in the same section).
	///
	/// Returns without touching the fragment when the extent contains no entries.
	void CodeViewContext::encodeInlineLineTable(MCAsmLayout &Layout,
	                                            MCCVInlineLineTableFragment &Frag) {
	  size_t LocBegin;
	  size_t LocEnd;
	  std::tie(LocBegin, LocEnd) = getLineExtent(Frag.SiteFuncId);

	  // Include all child inline call sites in our .cv_loc extent.
	  MCCVFunctionInfo *SiteInfo = getCVFunctionInfo(Frag.SiteFuncId);
	  for (auto &KV : SiteInfo->InlinedAtMap) {
	    unsigned ChildId = KV.first;
	    auto Extent = getLineExtent(ChildId);
	    LocBegin = std::min(LocBegin, Extent.first);
	    LocEnd = std::max(LocEnd, Extent.second);
	  }

	  if (LocBegin >= LocEnd)
	    return;
	  ArrayRef<MCCVLineEntry> Locs = getLinesForExtent(LocBegin, LocEnd);
	  if (Locs.empty())
	    return;

	  // Make an artificial start location using the function start and the inlinee
	  // lines start location information. All deltas start relative to this
	  // location.
	  MCCVLineEntry StartLoc(Frag.getFnStartSym(), MCCVLoc(Locs.front()));
	  StartLoc.setFileNum(Frag.StartFileId);
	  StartLoc.setLine(Frag.StartLineNum);
	  bool HaveOpenRange = false;

	  const MCSymbol *LastLabel = Frag.getFnStartSym();
	  MCCVFunctionInfo::LineInfo LastSourceLoc, CurSourceLoc;
	  LastSourceLoc.File = Frag.StartFileId;
	  LastSourceLoc.Line = Frag.StartLineNum;

	  SmallVectorImpl<char> &Buffer = Frag.getContents();
	  Buffer.clear(); // Clear old contents if we went through relaxation.
	  for (const MCCVLineEntry &Loc : Locs) {
	    // Exit early if our line table would produce an oversized InlineSiteSym
	    // record. Account for the ChangeCodeLength annotation emitted after the
	    // loop ends.
	    constexpr uint32_t InlineSiteSize = 12;
	    constexpr uint32_t AnnotationSize = 8;
	    size_t MaxBufferSize = MaxRecordLength - InlineSiteSize - AnnotationSize;
	    if (Buffer.size() >= MaxBufferSize)
	      break;

	    if (Loc.getFunctionId() == Frag.SiteFuncId) {
	      CurSourceLoc.File = Loc.getFileNum();
	      CurSourceLoc.Line = Loc.getLine();
	    } else {
	      auto I = SiteInfo->InlinedAtMap.find(Loc.getFunctionId());
	      if (I != SiteInfo->InlinedAtMap.end()) {
	        // This .cv_loc is from a child inline call site. Use the source
	        // location of the inlined call site instead of the .cv_loc directive
	        // source location.
	        CurSourceLoc = I->second;
	      } else {
	        // We've hit a cv_loc not attributed to this inline call site. Use this
	        // label to end the PC range.
	        if (HaveOpenRange) {
	          unsigned Length = computeLabelDiff(Layout, LastLabel, Loc.getLabel());
	          compressAnnotation(BinaryAnnotationsOpCode::ChangeCodeLength, Buffer);
	          compressAnnotation(Length, Buffer);
	          LastLabel = Loc.getLabel();
	        }
	        HaveOpenRange = false;
	        continue;
	      }
	    }

	    // Skip this .cv_loc if we have an open range and this isn't a meaningful
	    // source location update. The current table format does not support column
	    // info, so we can skip updates for those.
	    if (HaveOpenRange && CurSourceLoc.File == LastSourceLoc.File &&
	        CurSourceLoc.Line == LastSourceLoc.Line)
	      continue;

	    HaveOpenRange = true;

	    if (CurSourceLoc.File != LastSourceLoc.File) {
	      // File ids are 1 based, and each file checksum table entry is 8 bytes
	      // long. See emitFileChecksums above.
	      unsigned FileOffset = 8 * (CurSourceLoc.File - 1);
	      compressAnnotation(BinaryAnnotationsOpCode::ChangeFile, Buffer);
	      compressAnnotation(FileOffset, Buffer);
	    }

	    int LineDelta = CurSourceLoc.Line - LastSourceLoc.Line;
	    unsigned EncodedLineDelta = encodeSignedNumber(LineDelta);
	    unsigned CodeDelta = computeLabelDiff(Layout, LastLabel, Loc.getLabel());
	    if (CodeDelta == 0 && LineDelta != 0) {
	      compressAnnotation(BinaryAnnotationsOpCode::ChangeLineOffset, Buffer);
	      compressAnnotation(EncodedLineDelta, Buffer);
	    } else if (EncodedLineDelta < 0x8 && CodeDelta <= 0xf) {
	      // The ChangeCodeOffsetAndLineOffset combination opcode is used when the
	      // encoded line delta uses 3 or fewer set bits and the code offset fits
	      // in one nibble.
	      unsigned Operand = (EncodedLineDelta << 4) | CodeDelta;
	      compressAnnotation(BinaryAnnotationsOpCode::ChangeCodeOffsetAndLineOffset,
	                         Buffer);
	      compressAnnotation(Operand, Buffer);
	    } else {
	      // Otherwise use the separate line and code deltas.
	      if (LineDelta != 0) {
	        compressAnnotation(BinaryAnnotationsOpCode::ChangeLineOffset, Buffer);
	        compressAnnotation(EncodedLineDelta, Buffer);
	      }
	      compressAnnotation(BinaryAnnotationsOpCode::ChangeCodeOffset, Buffer);
	      compressAnnotation(CodeDelta, Buffer);
	    }

	    LastLabel = Loc.getLabel();
	    LastSourceLoc = CurSourceLoc;
	  }

	  assert(HaveOpenRange);

	  // Close the last range at whichever comes first: the function end symbol or
	  // the first location past our extent.
	  unsigned EndSymLength =
	      computeLabelDiff(Layout, LastLabel, Frag.getFnEndSym());
	  unsigned LocAfterLength = ~0U;
	  ArrayRef<MCCVLineEntry> LocAfter = getLinesForExtent(LocEnd, LocEnd + 1);
	  if (!LocAfter.empty()) {
	    // Only try to compute this difference if we're in the same section.
	    const MCCVLineEntry &Loc = LocAfter[0];
	    if (&Loc.getLabel()->getSection(false) == &LastLabel->getSection(false))
	      LocAfterLength = computeLabelDiff(Layout, LastLabel, Loc.getLabel());
	  }

	  compressAnnotation(BinaryAnnotationsOpCode::ChangeCodeLength, Buffer);
	  compressAnnotation(std::min(EndSymLength, LocAfterLength), Buffer);
	}

	// Rebuild the contents and fixups of a def-range fragment.
	//
	// Consecutive ranges are merged while their combined size (including the gaps
	// between them) stays within MaxDefRange. For each merged run, one record is
	// written per MaxDefRange-sized chunk: a 16-bit record size, the fragment's
	// fixed-size portion, 4 bytes covered by an FK_SecRel_4 fixup, 2 bytes covered
	// by an FK_SecRel_2 fixup, and the 16-bit chunk length. One pair of 16-bit
	// values per gap follows the records of the run.
	//
	// NOTE(review): this listing is a diff hunk; lines starting with '-' belong to
	// the old revision and lines starting with '+' to the new one.
	void CodeViewContext::encodeDefRange(MCAsmLayout &Layout,
	MCCVDefRangeFragment &Frag) {
	MCContext &Ctx = Layout.getAssembler().getContext();
	SmallVectorImpl<char> &Contents = Frag.getContents();
	Contents.clear();
	SmallVectorImpl<MCFixup> &Fixups = Frag.getFixups();
	Fixups.clear();
	raw_svector_ostream OS(Contents);

	// Compute all the sizes up front.
	SmallVector<std::pair<unsigned, unsigned>, 4> GapAndRangeSizes;
	const MCSymbol *LastLabel = nullptr;
	// NOTE(review): Range.first/second are passed to computeLabelDiff(), which
	// takes symbol pointers; the pair element types below look like they lost
	// their pointer markers in this listing -- confirm against the real file.
	for (std::pair<const MCSymbol , const MCSymbol > Range : Frag.getRanges()) {
	// The first range has no preceding range, so its gap is 0.
	unsigned GapSize =
	LastLabel ? computeLabelDiff(Layout, LastLabel, Range.first) : 0;
	unsigned RangeSize = computeLabelDiff(Layout, Range.first, Range.second);
	GapAndRangeSizes.push_back({GapSize, RangeSize});
	LastLabel = Range.second;
	}

	// Write down each range where the variable is defined.
	for (size_t I = 0, E = Frag.getRanges().size(); I != E;) {
	// If the range size of multiple consecutive ranges is under the max,
	// combine the ranges and emit some gaps.
	const MCSymbol *RangeBegin = Frag.getRanges()[I].first;
	unsigned RangeSize = GapAndRangeSizes[I].second;
	size_t J = I + 1;
	for (; J != E; ++J) {
	unsigned GapAndRangeSize = GapAndRangeSizes[J].first + GapAndRangeSizes[J].second;
	if (RangeSize + GapAndRangeSize > MaxDefRange)
	break;
	RangeSize += GapAndRangeSize;
	}
	// Ranges [I, J) were merged; each merged-in range after the first adds a gap.
	unsigned NumGaps = J - I - 1;

	support::endian::Writer<support::little> LEWriter(OS);

	// Offset from RangeBegin of the chunk currently being written.
	unsigned Bias = 0;
	// We must split the range into chunks of MaxDefRange, this is a fundamental
	// limitation of the file format.
	do {
	uint16_t Chunk = std::min((uint32_t)MaxDefRange, RangeSize);

	const MCSymbolRefExpr *SRE = MCSymbolRefExpr::create(RangeBegin, Ctx);
	const MCBinaryExpr *BE =
	MCBinaryExpr::createAdd(SRE, MCConstantExpr::create(Bias, Ctx), Ctx);
	MCValue Res;
	// NOTE(review): "/Fixup=/" looks like an argument-name comment whose
	// delimiters were lost in this listing -- confirm against the real file.
	BE->evaluateAsRelocatable(Res, &Layout, /Fixup=/nullptr);

	// Each record begins with a 2-byte number indicating how large the record
	// is.
	StringRef FixedSizePortion = Frag.getFixedSizePortion();
	// Our record is a fixed sized prefix and a LocalVariableAddrRange that we
	// are artificially constructing.
	size_t RecordSize = FixedSizePortion.size() +
	sizeof(LocalVariableAddrRange) + 4 * NumGaps;
	- // Write out the recrod size.
	- support::endian::Writer<support::little>(OS).write<uint16_t>(RecordSize);
	+ // Write out the record size.
	+ LEWriter.write<uint16_t>(RecordSize);
	// Write out the fixed size prefix.
	OS << FixedSizePortion;
	// Make space for a fixup that will eventually have a section relative
	// relocation pointing at the offset where the variable becomes live.
	Fixups.push_back(MCFixup::create(Contents.size(), BE, FK_SecRel_4));
	- Contents.resize(Contents.size() + 4); // Fixup for code start.
	+ LEWriter.write<uint32_t>(0); // Fixup for code start.
	// Make space for a fixup that will record the section index for the code.
	Fixups.push_back(MCFixup::create(Contents.size(), BE, FK_SecRel_2));
	- Contents.resize(Contents.size() + 2); // Fixup for section index.
	+ LEWriter.write<uint16_t>(0); // Fixup for section index.
	// Write down the range's extent.
	LEWriter.write<uint16_t>(Chunk);

	// Move on to the next range.
	Bias += Chunk;
	RangeSize -= Chunk;
	} while (RangeSize > 0);

	// Emit the gaps afterwards.
	- assert((NumGaps == 0 \|\| Bias < MaxDefRange) &&
	+ assert((NumGaps == 0 \|\| Bias <= MaxDefRange) &&
	"large ranges should not have gaps");
	// The first gap starts where the first range of the run ends.
	unsigned GapStartOffset = GapAndRangeSizes[I].second;
	// This loop also advances I to J, i.e. to the start of the next run.
	for (++I; I != J; ++I) {
	unsigned GapSize, RangeSize;
	assert(I < GapAndRangeSizes.size());
	std::tie(GapSize, RangeSize) = GapAndRangeSizes[I];
	LEWriter.write<uint16_t>(GapStartOffset);
	- LEWriter.write<uint16_t>(RangeSize);
	+ LEWriter.write<uint16_t>(GapSize);
	GapStartOffset += GapSize + RangeSize;
	}
	}
	}

	//
	// Called when an instruction is assembled into a section. If a .cv_loc
	// directive has been seen that has not yet had a line entry made for it, a
	// line entry is created for it here; otherwise nothing happens.
	//
	void MCCVLineEntry::Make(MCObjectStreamer *MCOS) {
	  auto &Ctx = MCOS->getContext();
	  CodeViewContext &CVCtx = Ctx.getCVContext();

	  if (CVCtx.getCVLocSeen()) {
	    // Emit a fresh temporary label at the current position; the line entry
	    // refers to it.
	    MCSymbol *EntryLabel = Ctx.createTempSymbol();
	    MCOS->EmitLabel(EntryLabel);

	    // Pair the label with the .cv_loc info currently saved in the context.
	    MCCVLineEntry Entry(EntryLabel, CVCtx.getCurrentCVLoc());

	    // Mark the saved .cv_loc info as used, then record the new entry.
	    CVCtx.clearCVLocSeen();
	    CVCtx.addLineEntry(Entry);
	  }
	}
	Index: projects/clang400-import/contrib/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
	===================================================================
	--- projects/clang400-import/contrib/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp (revision 313642)
	+++ projects/clang400-import/contrib/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp (revision 313643)
	@@ -1,10714 +1,10715 @@
	//===-- AArch64ISelLowering.cpp - AArch64 DAG Lowering Implementation ----===//
	//
	// The LLVM Compiler Infrastructure
	//
	// This file is distributed under the University of Illinois Open Source
	// License. See LICENSE.TXT for details.
	//
	//===----------------------------------------------------------------------===//
	//
	// This file implements the AArch64TargetLowering class.
	//
	//===----------------------------------------------------------------------===//

	#include "AArch64CallingConvention.h"
	#include "AArch64MachineFunctionInfo.h"
	#include "AArch64ISelLowering.h"
	#include "AArch64PerfectShuffle.h"
	#include "AArch64RegisterInfo.h"
	#include "AArch64Subtarget.h"
	#include "MCTargetDesc/AArch64AddressingModes.h"
	#include "Utils/AArch64BaseInfo.h"
	#include "llvm/ADT/APFloat.h"
	#include "llvm/ADT/APInt.h"
	#include "llvm/ADT/ArrayRef.h"
	#include "llvm/ADT/SmallVector.h"
	#include "llvm/ADT/Statistic.h"
	#include "llvm/ADT/STLExtras.h"
	#include "llvm/ADT/StringRef.h"
	#include "llvm/ADT/StringSwitch.h"
	#include "llvm/ADT/Triple.h"
	#include "llvm/ADT/Twine.h"
	#include "llvm/CodeGen/CallingConvLower.h"
	#include "llvm/CodeGen/MachineBasicBlock.h"
	#include "llvm/CodeGen/MachineFrameInfo.h"
	#include "llvm/CodeGen/MachineFunction.h"
	#include "llvm/CodeGen/MachineInstr.h"
	#include "llvm/CodeGen/MachineInstrBuilder.h"
	#include "llvm/CodeGen/MachineMemOperand.h"
	#include "llvm/CodeGen/MachineRegisterInfo.h"
	#include "llvm/CodeGen/MachineValueType.h"
	#include "llvm/CodeGen/RuntimeLibcalls.h"
	#include "llvm/CodeGen/SelectionDAG.h"
	#include "llvm/CodeGen/SelectionDAGNodes.h"
	#include "llvm/CodeGen/ValueTypes.h"
	#include "llvm/IR/Attributes.h"
	#include "llvm/IR/Constants.h"
	#include "llvm/IR/DataLayout.h"
	#include "llvm/IR/DebugLoc.h"
	#include "llvm/IR/DerivedTypes.h"
	#include "llvm/IR/Function.h"
	#include "llvm/IR/GetElementPtrTypeIterator.h"
	#include "llvm/IR/GlobalValue.h"
	#include "llvm/IR/Instruction.h"
	#include "llvm/IR/Instructions.h"
	#include "llvm/IR/Intrinsics.h"
	#include "llvm/IR/IRBuilder.h"
	#include "llvm/IR/Module.h"
	#include "llvm/IR/OperandTraits.h"
	#include "llvm/IR/Type.h"
	#include "llvm/IR/Use.h"
	#include "llvm/IR/Value.h"
	#include "llvm/MC/MCRegisterInfo.h"
	#include "llvm/Support/Casting.h"
	#include "llvm/Support/CodeGen.h"
	#include "llvm/Support/CommandLine.h"
	#include "llvm/Support/Compiler.h"
	#include "llvm/Support/Debug.h"
	#include "llvm/Support/ErrorHandling.h"
	#include "llvm/Support/MathExtras.h"
	#include "llvm/Support/raw_ostream.h"
	#include "llvm/Target/TargetCallingConv.h"
	#include "llvm/Target/TargetInstrInfo.h"
	#include "llvm/Target/TargetMachine.h"
	#include "llvm/Target/TargetOptions.h"
	#include <algorithm>
	#include <bitset>
	#include <cassert>
	#include <cctype>
	#include <cstdint>
	#include <cstdlib>
	#include <iterator>
	#include <limits>
	#include <tuple>
	#include <utility>
	#include <vector>

	using namespace llvm;

	#define DEBUG_TYPE "aarch64-lower"

	// Counters declared with the STATISTIC macro; the string is each counter's
	// description.
	STATISTIC(NumTailCalls, "Number of tail calls");
	STATISTIC(NumShiftInserts, "Number of vector shift inserts");

	// Hidden command-line flag "aarch64-shift-insert-generation", off by default,
	// described as allowing SLI/SRI formation.
	static cl::opt<bool>
	EnableAArch64SlrGeneration("aarch64-shift-insert-generation", cl::Hidden,
	cl::desc("Allow AArch64 SLI/SRI formation"),
	cl::init(false));

	// FIXME: The necessary dtprel relocations don't seem to be supported
	// well in the GNU bfd and gold linkers at the moment. Therefore, by
	// default, for now, fall back to GeneralDynamic code generation.
	// Hidden command-line flag "aarch64-elf-ldtls-generation", off by default.
	// Unlike the flag above it is not declared static.
	cl::opt<bool> EnableAArch64ELFLocalDynamicTLSGeneration(
	"aarch64-elf-ldtls-generation", cl::Hidden,
	cl::desc("Allow AArch64 Local Dynamic TLS code generation"),
	cl::init(false));

	/// Value type used for condition codes.
	static const MVT MVT_CC = MVT::i32;

	AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
	const AArch64Subtarget &STI)
	: TargetLowering(TM), Subtarget(&STI) {
	// AArch64 doesn't have comparisons which set GPRs or setcc instructions, so
	// we have to make something up. Arbitrarily, choose ZeroOrOne.
	setBooleanContents(ZeroOrOneBooleanContent);
	// When comparing vectors the result sets the different elements in the
	// vector to all-one or all-zero.
	setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);

	// Set up the register classes.
	addRegisterClass(MVT::i32, &AArch64::GPR32allRegClass);
	addRegisterClass(MVT::i64, &AArch64::GPR64allRegClass);

	if (Subtarget->hasFPARMv8()) {
	addRegisterClass(MVT::f16, &AArch64::FPR16RegClass);
	addRegisterClass(MVT::f32, &AArch64::FPR32RegClass);
	addRegisterClass(MVT::f64, &AArch64::FPR64RegClass);
	addRegisterClass(MVT::f128, &AArch64::FPR128RegClass);
	}

	if (Subtarget->hasNEON()) {
	addRegisterClass(MVT::v16i8, &AArch64::FPR8RegClass);
	addRegisterClass(MVT::v8i16, &AArch64::FPR16RegClass);
	// Someone set us up the NEON.
	addDRTypeForNEON(MVT::v2f32);
	addDRTypeForNEON(MVT::v8i8);
	addDRTypeForNEON(MVT::v4i16);
	addDRTypeForNEON(MVT::v2i32);
	addDRTypeForNEON(MVT::v1i64);
	addDRTypeForNEON(MVT::v1f64);
	addDRTypeForNEON(MVT::v4f16);

	addQRTypeForNEON(MVT::v4f32);
	addQRTypeForNEON(MVT::v2f64);
	addQRTypeForNEON(MVT::v16i8);
	addQRTypeForNEON(MVT::v8i16);
	addQRTypeForNEON(MVT::v4i32);
	addQRTypeForNEON(MVT::v2i64);
	addQRTypeForNEON(MVT::v8f16);
	}

	// Compute derived properties from the register classes
	computeRegisterProperties(Subtarget->getRegisterInfo());

	// Provide all sorts of operation actions
	setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);
	setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom);
	setOperationAction(ISD::SETCC, MVT::i32, Custom);
	setOperationAction(ISD::SETCC, MVT::i64, Custom);
	setOperationAction(ISD::SETCC, MVT::f32, Custom);
	setOperationAction(ISD::SETCC, MVT::f64, Custom);
	setOperationAction(ISD::BITREVERSE, MVT::i32, Legal);
	setOperationAction(ISD::BITREVERSE, MVT::i64, Legal);
	setOperationAction(ISD::BRCOND, MVT::Other, Expand);
	setOperationAction(ISD::BR_CC, MVT::i32, Custom);
	setOperationAction(ISD::BR_CC, MVT::i64, Custom);
	setOperationAction(ISD::BR_CC, MVT::f32, Custom);
	setOperationAction(ISD::BR_CC, MVT::f64, Custom);
	setOperationAction(ISD::SELECT, MVT::i32, Custom);
	setOperationAction(ISD::SELECT, MVT::i64, Custom);
	setOperationAction(ISD::SELECT, MVT::f32, Custom);
	setOperationAction(ISD::SELECT, MVT::f64, Custom);
	setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);
	setOperationAction(ISD::SELECT_CC, MVT::i64, Custom);
	setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
	setOperationAction(ISD::SELECT_CC, MVT::f64, Custom);
	setOperationAction(ISD::BR_JT, MVT::Other, Expand);
	setOperationAction(ISD::JumpTable, MVT::i64, Custom);

	setOperationAction(ISD::SHL_PARTS, MVT::i64, Custom);
	setOperationAction(ISD::SRA_PARTS, MVT::i64, Custom);
	setOperationAction(ISD::SRL_PARTS, MVT::i64, Custom);

	setOperationAction(ISD::FREM, MVT::f32, Expand);
	setOperationAction(ISD::FREM, MVT::f64, Expand);
	setOperationAction(ISD::FREM, MVT::f80, Expand);

	// Custom lowering hooks are needed for XOR
	// to fold it into CSINC/CSINV.
	setOperationAction(ISD::XOR, MVT::i32, Custom);
	setOperationAction(ISD::XOR, MVT::i64, Custom);

	// Virtually no operation on f128 is legal, but LLVM can't expand them when
	// there's a valid register class, so we need custom operations in most cases.
	setOperationAction(ISD::FABS, MVT::f128, Expand);
	setOperationAction(ISD::FADD, MVT::f128, Custom);
	setOperationAction(ISD::FCOPYSIGN, MVT::f128, Expand);
	setOperationAction(ISD::FCOS, MVT::f128, Expand);
	setOperationAction(ISD::FDIV, MVT::f128, Custom);
	setOperationAction(ISD::FMA, MVT::f128, Expand);
	setOperationAction(ISD::FMUL, MVT::f128, Custom);
	setOperationAction(ISD::FNEG, MVT::f128, Expand);
	setOperationAction(ISD::FPOW, MVT::f128, Expand);
	setOperationAction(ISD::FREM, MVT::f128, Expand);
	setOperationAction(ISD::FRINT, MVT::f128, Expand);
	setOperationAction(ISD::FSIN, MVT::f128, Expand);
	setOperationAction(ISD::FSINCOS, MVT::f128, Expand);
	setOperationAction(ISD::FSQRT, MVT::f128, Expand);
	setOperationAction(ISD::FSUB, MVT::f128, Custom);
	setOperationAction(ISD::FTRUNC, MVT::f128, Expand);
	setOperationAction(ISD::SETCC, MVT::f128, Custom);
	setOperationAction(ISD::BR_CC, MVT::f128, Custom);
	setOperationAction(ISD::SELECT, MVT::f128, Custom);
	setOperationAction(ISD::SELECT_CC, MVT::f128, Custom);
	setOperationAction(ISD::FP_EXTEND, MVT::f128, Custom);

	// Lowering for many of the conversions is actually specified by the non-f128
	// type. The LowerXXX function will be trivial when f128 isn't involved.
	setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
	setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
	setOperationAction(ISD::FP_TO_SINT, MVT::i128, Custom);
	setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
	setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
	setOperationAction(ISD::FP_TO_UINT, MVT::i128, Custom);
	setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
	setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);
	setOperationAction(ISD::SINT_TO_FP, MVT::i128, Custom);
	setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom);
	setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
	setOperationAction(ISD::UINT_TO_FP, MVT::i128, Custom);
	setOperationAction(ISD::FP_ROUND, MVT::f32, Custom);
	setOperationAction(ISD::FP_ROUND, MVT::f64, Custom);

	// Variable arguments.
	setOperationAction(ISD::VASTART, MVT::Other, Custom);
	setOperationAction(ISD::VAARG, MVT::Other, Custom);
	setOperationAction(ISD::VACOPY, MVT::Other, Custom);
	setOperationAction(ISD::VAEND, MVT::Other, Expand);

	// Variable-sized objects.
	setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
	setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
	setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Expand);

	// Constant pool entries
	setOperationAction(ISD::ConstantPool, MVT::i64, Custom);

	// BlockAddress
	setOperationAction(ISD::BlockAddress, MVT::i64, Custom);

	// Add/Sub overflow ops with MVT::Glues are lowered to NZCV dependences.
	setOperationAction(ISD::ADDC, MVT::i32, Custom);
	setOperationAction(ISD::ADDE, MVT::i32, Custom);
	setOperationAction(ISD::SUBC, MVT::i32, Custom);
	setOperationAction(ISD::SUBE, MVT::i32, Custom);
	setOperationAction(ISD::ADDC, MVT::i64, Custom);
	setOperationAction(ISD::ADDE, MVT::i64, Custom);
	setOperationAction(ISD::SUBC, MVT::i64, Custom);
	setOperationAction(ISD::SUBE, MVT::i64, Custom);

	// AArch64 lacks both left-rotate and popcount instructions.
	setOperationAction(ISD::ROTL, MVT::i32, Expand);
	setOperationAction(ISD::ROTL, MVT::i64, Expand);
	for (MVT VT : MVT::vector_valuetypes()) {
	setOperationAction(ISD::ROTL, VT, Expand);
	setOperationAction(ISD::ROTR, VT, Expand);
	}

	// AArch64 doesn't have {U\|S}MUL_LOHI.
	setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand);
	setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand);

	setOperationAction(ISD::CTPOP, MVT::i32, Custom);
	setOperationAction(ISD::CTPOP, MVT::i64, Custom);

	setOperationAction(ISD::SDIVREM, MVT::i32, Expand);
	setOperationAction(ISD::SDIVREM, MVT::i64, Expand);
	for (MVT VT : MVT::vector_valuetypes()) {
	setOperationAction(ISD::SDIVREM, VT, Expand);
	setOperationAction(ISD::UDIVREM, VT, Expand);
	}
	setOperationAction(ISD::SREM, MVT::i32, Expand);
	setOperationAction(ISD::SREM, MVT::i64, Expand);
	setOperationAction(ISD::UDIVREM, MVT::i32, Expand);
	setOperationAction(ISD::UDIVREM, MVT::i64, Expand);
	setOperationAction(ISD::UREM, MVT::i32, Expand);
	setOperationAction(ISD::UREM, MVT::i64, Expand);

	// Custom lower Add/Sub/Mul with overflow.
	setOperationAction(ISD::SADDO, MVT::i32, Custom);
	setOperationAction(ISD::SADDO, MVT::i64, Custom);
	setOperationAction(ISD::UADDO, MVT::i32, Custom);
	setOperationAction(ISD::UADDO, MVT::i64, Custom);
	setOperationAction(ISD::SSUBO, MVT::i32, Custom);
	setOperationAction(ISD::SSUBO, MVT::i64, Custom);
	setOperationAction(ISD::USUBO, MVT::i32, Custom);
	setOperationAction(ISD::USUBO, MVT::i64, Custom);
	setOperationAction(ISD::SMULO, MVT::i32, Custom);
	setOperationAction(ISD::SMULO, MVT::i64, Custom);
	setOperationAction(ISD::UMULO, MVT::i32, Custom);
	setOperationAction(ISD::UMULO, MVT::i64, Custom);

	setOperationAction(ISD::FSIN, MVT::f32, Expand);
	setOperationAction(ISD::FSIN, MVT::f64, Expand);
	setOperationAction(ISD::FCOS, MVT::f32, Expand);
	setOperationAction(ISD::FCOS, MVT::f64, Expand);
	setOperationAction(ISD::FPOW, MVT::f32, Expand);
	setOperationAction(ISD::FPOW, MVT::f64, Expand);
	setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom);
	setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);

	// f16 is a storage-only type, always promote it to f32.
	setOperationAction(ISD::SETCC, MVT::f16, Promote);
	setOperationAction(ISD::BR_CC, MVT::f16, Promote);
	setOperationAction(ISD::SELECT_CC, MVT::f16, Promote);
	setOperationAction(ISD::SELECT, MVT::f16, Promote);
	setOperationAction(ISD::FADD, MVT::f16, Promote);
	setOperationAction(ISD::FSUB, MVT::f16, Promote);
	setOperationAction(ISD::FMUL, MVT::f16, Promote);
	setOperationAction(ISD::FDIV, MVT::f16, Promote);
	setOperationAction(ISD::FREM, MVT::f16, Promote);
	setOperationAction(ISD::FMA, MVT::f16, Promote);
	setOperationAction(ISD::FNEG, MVT::f16, Promote);
	setOperationAction(ISD::FABS, MVT::f16, Promote);
	setOperationAction(ISD::FCEIL, MVT::f16, Promote);
	setOperationAction(ISD::FCOPYSIGN, MVT::f16, Promote);
	setOperationAction(ISD::FCOS, MVT::f16, Promote);
	setOperationAction(ISD::FFLOOR, MVT::f16, Promote);
	setOperationAction(ISD::FNEARBYINT, MVT::f16, Promote);
	setOperationAction(ISD::FPOW, MVT::f16, Promote);
	setOperationAction(ISD::FPOWI, MVT::f16, Promote);
	setOperationAction(ISD::FRINT, MVT::f16, Promote);
	setOperationAction(ISD::FSIN, MVT::f16, Promote);
	setOperationAction(ISD::FSINCOS, MVT::f16, Promote);
	setOperationAction(ISD::FSQRT, MVT::f16, Promote);
	setOperationAction(ISD::FEXP, MVT::f16, Promote);
	setOperationAction(ISD::FEXP2, MVT::f16, Promote);
	setOperationAction(ISD::FLOG, MVT::f16, Promote);
	setOperationAction(ISD::FLOG2, MVT::f16, Promote);
	setOperationAction(ISD::FLOG10, MVT::f16, Promote);
	setOperationAction(ISD::FROUND, MVT::f16, Promote);
	setOperationAction(ISD::FTRUNC, MVT::f16, Promote);
	setOperationAction(ISD::FMINNUM, MVT::f16, Promote);
	setOperationAction(ISD::FMAXNUM, MVT::f16, Promote);
	setOperationAction(ISD::FMINNAN, MVT::f16, Promote);
	setOperationAction(ISD::FMAXNAN, MVT::f16, Promote);

	// v4f16 is also a storage-only type, so promote it to v4f32 when that is
	// known to be safe.
	setOperationAction(ISD::FADD, MVT::v4f16, Promote);
	setOperationAction(ISD::FSUB, MVT::v4f16, Promote);
	setOperationAction(ISD::FMUL, MVT::v4f16, Promote);
	setOperationAction(ISD::FDIV, MVT::v4f16, Promote);
	setOperationAction(ISD::FP_EXTEND, MVT::v4f16, Promote);
	setOperationAction(ISD::FP_ROUND, MVT::v4f16, Promote);
	AddPromotedToType(ISD::FADD, MVT::v4f16, MVT::v4f32);
	AddPromotedToType(ISD::FSUB, MVT::v4f16, MVT::v4f32);
	AddPromotedToType(ISD::FMUL, MVT::v4f16, MVT::v4f32);
	AddPromotedToType(ISD::FDIV, MVT::v4f16, MVT::v4f32);
	AddPromotedToType(ISD::FP_EXTEND, MVT::v4f16, MVT::v4f32);
	AddPromotedToType(ISD::FP_ROUND, MVT::v4f16, MVT::v4f32);

	// Expand all other v4f16 operations.
	// FIXME: We could generate better code by promoting some operations to
	// a pair of v4f32s
	setOperationAction(ISD::FABS, MVT::v4f16, Expand);
	setOperationAction(ISD::FCEIL, MVT::v4f16, Expand);
	setOperationAction(ISD::FCOPYSIGN, MVT::v4f16, Expand);
	setOperationAction(ISD::FCOS, MVT::v4f16, Expand);
	setOperationAction(ISD::FFLOOR, MVT::v4f16, Expand);
	setOperationAction(ISD::FMA, MVT::v4f16, Expand);
	setOperationAction(ISD::FNEARBYINT, MVT::v4f16, Expand);
	setOperationAction(ISD::FNEG, MVT::v4f16, Expand);
	setOperationAction(ISD::FPOW, MVT::v4f16, Expand);
	setOperationAction(ISD::FPOWI, MVT::v4f16, Expand);
	setOperationAction(ISD::FREM, MVT::v4f16, Expand);
	setOperationAction(ISD::FROUND, MVT::v4f16, Expand);
	setOperationAction(ISD::FRINT, MVT::v4f16, Expand);
	setOperationAction(ISD::FSIN, MVT::v4f16, Expand);
	setOperationAction(ISD::FSINCOS, MVT::v4f16, Expand);
	setOperationAction(ISD::FSQRT, MVT::v4f16, Expand);
	setOperationAction(ISD::FTRUNC, MVT::v4f16, Expand);
	setOperationAction(ISD::SETCC, MVT::v4f16, Expand);
	setOperationAction(ISD::BR_CC, MVT::v4f16, Expand);
	setOperationAction(ISD::SELECT, MVT::v4f16, Expand);
	setOperationAction(ISD::SELECT_CC, MVT::v4f16, Expand);
	setOperationAction(ISD::FEXP, MVT::v4f16, Expand);
	setOperationAction(ISD::FEXP2, MVT::v4f16, Expand);
	setOperationAction(ISD::FLOG, MVT::v4f16, Expand);
	setOperationAction(ISD::FLOG2, MVT::v4f16, Expand);
	setOperationAction(ISD::FLOG10, MVT::v4f16, Expand);


	// v8f16 is also a storage-only type, so expand it.
	setOperationAction(ISD::FABS, MVT::v8f16, Expand);
	setOperationAction(ISD::FADD, MVT::v8f16, Expand);
	setOperationAction(ISD::FCEIL, MVT::v8f16, Expand);
	setOperationAction(ISD::FCOPYSIGN, MVT::v8f16, Expand);
	setOperationAction(ISD::FCOS, MVT::v8f16, Expand);
	setOperationAction(ISD::FDIV, MVT::v8f16, Expand);
	setOperationAction(ISD::FFLOOR, MVT::v8f16, Expand);
	setOperationAction(ISD::FMA, MVT::v8f16, Expand);
	setOperationAction(ISD::FMUL, MVT::v8f16, Expand);
	setOperationAction(ISD::FNEARBYINT, MVT::v8f16, Expand);
	setOperationAction(ISD::FNEG, MVT::v8f16, Expand);
	setOperationAction(ISD::FPOW, MVT::v8f16, Expand);
	setOperationAction(ISD::FPOWI, MVT::v8f16, Expand);
	setOperationAction(ISD::FREM, MVT::v8f16, Expand);
	setOperationAction(ISD::FROUND, MVT::v8f16, Expand);
	setOperationAction(ISD::FRINT, MVT::v8f16, Expand);
	setOperationAction(ISD::FSIN, MVT::v8f16, Expand);
	setOperationAction(ISD::FSINCOS, MVT::v8f16, Expand);
	setOperationAction(ISD::FSQRT, MVT::v8f16, Expand);
	setOperationAction(ISD::FSUB, MVT::v8f16, Expand);
	setOperationAction(ISD::FTRUNC, MVT::v8f16, Expand);
	setOperationAction(ISD::SETCC, MVT::v8f16, Expand);
	setOperationAction(ISD::BR_CC, MVT::v8f16, Expand);
	setOperationAction(ISD::SELECT, MVT::v8f16, Expand);
	setOperationAction(ISD::SELECT_CC, MVT::v8f16, Expand);
	setOperationAction(ISD::FP_EXTEND, MVT::v8f16, Expand);
	setOperationAction(ISD::FEXP, MVT::v8f16, Expand);
	setOperationAction(ISD::FEXP2, MVT::v8f16, Expand);
	setOperationAction(ISD::FLOG, MVT::v8f16, Expand);
	setOperationAction(ISD::FLOG2, MVT::v8f16, Expand);
	setOperationAction(ISD::FLOG10, MVT::v8f16, Expand);

	// AArch64 has implementations of a lot of rounding-like FP operations.
	for (MVT Ty : {MVT::f32, MVT::f64}) {
	setOperationAction(ISD::FFLOOR, Ty, Legal);
	setOperationAction(ISD::FNEARBYINT, Ty, Legal);
	setOperationAction(ISD::FCEIL, Ty, Legal);
	setOperationAction(ISD::FRINT, Ty, Legal);
	setOperationAction(ISD::FTRUNC, Ty, Legal);
	setOperationAction(ISD::FROUND, Ty, Legal);
	setOperationAction(ISD::FMINNUM, Ty, Legal);
	setOperationAction(ISD::FMAXNUM, Ty, Legal);
	setOperationAction(ISD::FMINNAN, Ty, Legal);
	setOperationAction(ISD::FMAXNAN, Ty, Legal);
	}

	setOperationAction(ISD::PREFETCH, MVT::Other, Custom);

	setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i128, Custom);

	// Lower READCYCLECOUNTER using an mrs from PMCCNTR_EL0.
	// This requires the Performance Monitors extension.
	if (Subtarget->hasPerfMon())
	setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Legal);

	if (Subtarget->isTargetMachO()) {
	// For iOS, we don't want to the normal expansion of a libcall to
	// sincos. We want to issue a libcall to __sincos_stret to avoid memory
	// traffic.
	setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
	setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
	} else {
	setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
	setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
	}

	// Make floating-point constants legal for the large code model, so they don't
	// become loads from the constant pool.
	if (Subtarget->isTargetMachO() && TM.getCodeModel() == CodeModel::Large) {
	setOperationAction(ISD::ConstantFP, MVT::f32, Legal);
	setOperationAction(ISD::ConstantFP, MVT::f64, Legal);
	}

	// AArch64 does not have floating-point extending loads, i1 sign-extending
	// load, floating-point truncating stores, or v2i32->v2i16 truncating store.
	for (MVT VT : MVT::fp_valuetypes()) {
	setLoadExtAction(ISD::EXTLOAD, VT, MVT::f16, Expand);
	setLoadExtAction(ISD::EXTLOAD, VT, MVT::f32, Expand);
	setLoadExtAction(ISD::EXTLOAD, VT, MVT::f64, Expand);
	setLoadExtAction(ISD::EXTLOAD, VT, MVT::f80, Expand);
	}
	for (MVT VT : MVT::integer_valuetypes())
	setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Expand);

	setTruncStoreAction(MVT::f32, MVT::f16, Expand);
	setTruncStoreAction(MVT::f64, MVT::f32, Expand);
	setTruncStoreAction(MVT::f64, MVT::f16, Expand);
	setTruncStoreAction(MVT::f128, MVT::f80, Expand);
	setTruncStoreAction(MVT::f128, MVT::f64, Expand);
	setTruncStoreAction(MVT::f128, MVT::f32, Expand);
	setTruncStoreAction(MVT::f128, MVT::f16, Expand);

	setOperationAction(ISD::BITCAST, MVT::i16, Custom);
	setOperationAction(ISD::BITCAST, MVT::f16, Custom);

	// Indexed loads and stores are supported.
	for (unsigned im = (unsigned)ISD::PRE_INC;
	im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
	setIndexedLoadAction(im, MVT::i8, Legal);
	setIndexedLoadAction(im, MVT::i16, Legal);
	setIndexedLoadAction(im, MVT::i32, Legal);
	setIndexedLoadAction(im, MVT::i64, Legal);
	setIndexedLoadAction(im, MVT::f64, Legal);
	setIndexedLoadAction(im, MVT::f32, Legal);
	setIndexedLoadAction(im, MVT::f16, Legal);
	setIndexedStoreAction(im, MVT::i8, Legal);
	setIndexedStoreAction(im, MVT::i16, Legal);
	setIndexedStoreAction(im, MVT::i32, Legal);
	setIndexedStoreAction(im, MVT::i64, Legal);
	setIndexedStoreAction(im, MVT::f64, Legal);
	setIndexedStoreAction(im, MVT::f32, Legal);
	setIndexedStoreAction(im, MVT::f16, Legal);
	}

	// Trap.
	setOperationAction(ISD::TRAP, MVT::Other, Legal);

	// We combine OR nodes for bitfield operations.
	setTargetDAGCombine(ISD::OR);

	// Vector add and sub nodes may conceal a high-half opportunity.
	// Also, try to fold ADD into CSINC/CSINV..
	setTargetDAGCombine(ISD::ADD);
	setTargetDAGCombine(ISD::SUB);
	setTargetDAGCombine(ISD::SRL);
	setTargetDAGCombine(ISD::XOR);
	setTargetDAGCombine(ISD::SINT_TO_FP);
	setTargetDAGCombine(ISD::UINT_TO_FP);

	setTargetDAGCombine(ISD::FP_TO_SINT);
	setTargetDAGCombine(ISD::FP_TO_UINT);
	setTargetDAGCombine(ISD::FDIV);

	setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN);

	setTargetDAGCombine(ISD::ANY_EXTEND);
	setTargetDAGCombine(ISD::ZERO_EXTEND);
	setTargetDAGCombine(ISD::SIGN_EXTEND);
	setTargetDAGCombine(ISD::BITCAST);
	setTargetDAGCombine(ISD::CONCAT_VECTORS);
	setTargetDAGCombine(ISD::STORE);
	if (Subtarget->supportsAddressTopByteIgnored())
	setTargetDAGCombine(ISD::LOAD);

	setTargetDAGCombine(ISD::MUL);

	setTargetDAGCombine(ISD::SELECT);
	setTargetDAGCombine(ISD::VSELECT);

	setTargetDAGCombine(ISD::INTRINSIC_VOID);
	setTargetDAGCombine(ISD::INTRINSIC_W_CHAIN);
	setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);
	setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);

	MaxStoresPerMemset = MaxStoresPerMemsetOptSize = 8;
	MaxStoresPerMemcpy = MaxStoresPerMemcpyOptSize = 4;
	MaxStoresPerMemmove = MaxStoresPerMemmoveOptSize = 4;

	setStackPointerRegisterToSaveRestore(AArch64::SP);

	setSchedulingPreference(Sched::Hybrid);

	// Enable TBZ/TBNZ
	MaskAndBranchFoldingIsLegal = true;
	EnableExtLdPromotion = true;

	// Set required alignment.
	setMinFunctionAlignment(2);
	// Set preferred alignments.
	setPrefFunctionAlignment(STI.getPrefFunctionAlignment());
	setPrefLoopAlignment(STI.getPrefLoopAlignment());

	// Only change the limit for entries in a jump table if specified by
	// the subtarget, but not at the command line.
	unsigned MaxJT = STI.getMaximumJumpTableSize();
	if (MaxJT && getMaximumJumpTableSize() == 0)
	setMaximumJumpTableSize(MaxJT);

	setHasExtractBitsInsn(true);

	setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);

	if (Subtarget->hasNEON()) {
	// FIXME: v1f64 shouldn't be legal if we can avoid it, because it leads to
	// silliness like this:
	setOperationAction(ISD::FABS, MVT::v1f64, Expand);
	setOperationAction(ISD::FADD, MVT::v1f64, Expand);
	setOperationAction(ISD::FCEIL, MVT::v1f64, Expand);
	setOperationAction(ISD::FCOPYSIGN, MVT::v1f64, Expand);
	setOperationAction(ISD::FCOS, MVT::v1f64, Expand);
	setOperationAction(ISD::FDIV, MVT::v1f64, Expand);
	setOperationAction(ISD::FFLOOR, MVT::v1f64, Expand);
	setOperationAction(ISD::FMA, MVT::v1f64, Expand);
	setOperationAction(ISD::FMUL, MVT::v1f64, Expand);
	setOperationAction(ISD::FNEARBYINT, MVT::v1f64, Expand);
	setOperationAction(ISD::FNEG, MVT::v1f64, Expand);
	setOperationAction(ISD::FPOW, MVT::v1f64, Expand);
	setOperationAction(ISD::FREM, MVT::v1f64, Expand);
	setOperationAction(ISD::FROUND, MVT::v1f64, Expand);
	setOperationAction(ISD::FRINT, MVT::v1f64, Expand);
	setOperationAction(ISD::FSIN, MVT::v1f64, Expand);
	setOperationAction(ISD::FSINCOS, MVT::v1f64, Expand);
	setOperationAction(ISD::FSQRT, MVT::v1f64, Expand);
	setOperationAction(ISD::FSUB, MVT::v1f64, Expand);
	setOperationAction(ISD::FTRUNC, MVT::v1f64, Expand);
	setOperationAction(ISD::SETCC, MVT::v1f64, Expand);
	setOperationAction(ISD::BR_CC, MVT::v1f64, Expand);
	setOperationAction(ISD::SELECT, MVT::v1f64, Expand);
	setOperationAction(ISD::SELECT_CC, MVT::v1f64, Expand);
	setOperationAction(ISD::FP_EXTEND, MVT::v1f64, Expand);

	setOperationAction(ISD::FP_TO_SINT, MVT::v1i64, Expand);
	setOperationAction(ISD::FP_TO_UINT, MVT::v1i64, Expand);
	setOperationAction(ISD::SINT_TO_FP, MVT::v1i64, Expand);
	setOperationAction(ISD::UINT_TO_FP, MVT::v1i64, Expand);
	setOperationAction(ISD::FP_ROUND, MVT::v1f64, Expand);

	setOperationAction(ISD::MUL, MVT::v1i64, Expand);

	// AArch64 doesn't have a direct vector ->f32 conversion instructions for
	// elements smaller than i32, so promote the input to i32 first.
	setOperationAction(ISD::UINT_TO_FP, MVT::v4i8, Promote);
	setOperationAction(ISD::SINT_TO_FP, MVT::v4i8, Promote);
	setOperationAction(ISD::UINT_TO_FP, MVT::v4i16, Promote);
	setOperationAction(ISD::SINT_TO_FP, MVT::v4i16, Promote);
	// i8 and i16 vector elements also need promotion to i32 for v8i8 or v8i16
	// -> v8f16 conversions.
	setOperationAction(ISD::SINT_TO_FP, MVT::v8i8, Promote);
	setOperationAction(ISD::UINT_TO_FP, MVT::v8i8, Promote);
	setOperationAction(ISD::SINT_TO_FP, MVT::v8i16, Promote);
	setOperationAction(ISD::UINT_TO_FP, MVT::v8i16, Promote);
	// Similarly, there is no direct i32 -> f64 vector conversion instruction.
	setOperationAction(ISD::SINT_TO_FP, MVT::v2i32, Custom);
	setOperationAction(ISD::UINT_TO_FP, MVT::v2i32, Custom);
	setOperationAction(ISD::SINT_TO_FP, MVT::v2i64, Custom);
	setOperationAction(ISD::UINT_TO_FP, MVT::v2i64, Custom);
	// Or, direct i32 -> f16 vector conversion. Set it so custom, so the
	// conversion happens in two steps: v4i32 -> v4f32 -> v4f16
	setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Custom);
	setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Custom);

	setOperationAction(ISD::CTLZ, MVT::v1i64, Expand);
	setOperationAction(ISD::CTLZ, MVT::v2i64, Expand);

	setOperationAction(ISD::CTTZ, MVT::v2i8, Expand);
	setOperationAction(ISD::CTTZ, MVT::v4i16, Expand);
	setOperationAction(ISD::CTTZ, MVT::v2i32, Expand);
	setOperationAction(ISD::CTTZ, MVT::v1i64, Expand);
	setOperationAction(ISD::CTTZ, MVT::v16i8, Expand);
	setOperationAction(ISD::CTTZ, MVT::v8i16, Expand);
	setOperationAction(ISD::CTTZ, MVT::v4i32, Expand);
	setOperationAction(ISD::CTTZ, MVT::v2i64, Expand);

	// AArch64 doesn't have MUL.2d:
	setOperationAction(ISD::MUL, MVT::v2i64, Expand);
	// Custom handling for some quad-vector types to detect MULL.
	setOperationAction(ISD::MUL, MVT::v8i16, Custom);
	setOperationAction(ISD::MUL, MVT::v4i32, Custom);
	setOperationAction(ISD::MUL, MVT::v2i64, Custom);

	setOperationAction(ISD::ANY_EXTEND, MVT::v4i32, Legal);
	setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand);
	// Likewise, narrowing and extending vector loads/stores aren't handled
	// directly.
	for (MVT VT : MVT::vector_valuetypes()) {
	setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Expand);

	setOperationAction(ISD::MULHS, VT, Expand);
	setOperationAction(ISD::SMUL_LOHI, VT, Expand);
	setOperationAction(ISD::MULHU, VT, Expand);
	setOperationAction(ISD::UMUL_LOHI, VT, Expand);

	setOperationAction(ISD::BSWAP, VT, Expand);

	for (MVT InnerVT : MVT::vector_valuetypes()) {
	setTruncStoreAction(VT, InnerVT, Expand);
	setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand);
	setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand);
	setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand);
	}
	}

	// AArch64 has implementations of a lot of rounding-like FP operations.
	for (MVT Ty : {MVT::v2f32, MVT::v4f32, MVT::v2f64}) {
	setOperationAction(ISD::FFLOOR, Ty, Legal);
	setOperationAction(ISD::FNEARBYINT, Ty, Legal);
	setOperationAction(ISD::FCEIL, Ty, Legal);
	setOperationAction(ISD::FRINT, Ty, Legal);
	setOperationAction(ISD::FTRUNC, Ty, Legal);
	setOperationAction(ISD::FROUND, Ty, Legal);
	}
	}

	PredictableSelectIsExpensive = Subtarget->predictableSelectIsExpensive();
	}

	/// Configure the legalization actions that are common to every NEON vector
	/// type.
	///
	/// \param VT                 The vector type to configure.
	/// \param PromotedBitwiseVT  Not read anywhere in this function.
	void AArch64TargetLowering::addTypeForNEON(MVT VT, MVT PromotedBitwiseVT) {
	  // Loads and stores of floating-point vectors are promoted to the integer
	  // vector type named in each branch (v2i32 or v2i64).
	  if (VT == MVT::v2f32 || VT == MVT::v4f16) {
	    setOperationAction(ISD::LOAD, VT, Promote);
	    AddPromotedToType(ISD::LOAD, VT, MVT::v2i32);

	    setOperationAction(ISD::STORE, VT, Promote);
	    AddPromotedToType(ISD::STORE, VT, MVT::v2i32);
	  } else if (VT == MVT::v2f64 || VT == MVT::v4f32 || VT == MVT::v8f16) {
	    setOperationAction(ISD::LOAD, VT, Promote);
	    AddPromotedToType(ISD::LOAD, VT, MVT::v2i64);

	    setOperationAction(ISD::STORE, VT, Promote);
	    AddPromotedToType(ISD::STORE, VT, MVT::v2i64);
	  }

	  // Mark vector float intrinsics as expand.
	  if (VT == MVT::v2f32 || VT == MVT::v4f32 || VT == MVT::v2f64) {
	    setOperationAction(ISD::FSIN, VT, Expand);
	    setOperationAction(ISD::FCOS, VT, Expand);
	    setOperationAction(ISD::FPOWI, VT, Expand);
	    setOperationAction(ISD::FPOW, VT, Expand);
	    setOperationAction(ISD::FLOG, VT, Expand);
	    setOperationAction(ISD::FLOG2, VT, Expand);
	    setOperationAction(ISD::FLOG10, VT, Expand);
	    setOperationAction(ISD::FEXP, VT, Expand);
	    setOperationAction(ISD::FEXP2, VT, Expand);

	    // But we do support custom-lowering for FCOPYSIGN.
	    setOperationAction(ISD::FCOPYSIGN, VT, Custom);
	  }

	  // Operations that get target-specific lowering for every NEON type.
	  setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
	  setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
	  setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
	  setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
	  setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
	  setOperationAction(ISD::SRA, VT, Custom);
	  setOperationAction(ISD::SRL, VT, Custom);
	  setOperationAction(ISD::SHL, VT, Custom);
	  setOperationAction(ISD::AND, VT, Custom);
	  setOperationAction(ISD::OR, VT, Custom);
	  setOperationAction(ISD::SETCC, VT, Custom);
	  setOperationAction(ISD::CONCAT_VECTORS, VT, Legal);

	  setOperationAction(ISD::SELECT, VT, Expand);
	  setOperationAction(ISD::SELECT_CC, VT, Expand);
	  setOperationAction(ISD::VSELECT, VT, Expand);
	  // Any-extending loads from this vector type are expanded for every result
	  // type.
	  for (MVT InnerVT : MVT::all_valuetypes())
	    setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);

	  // CNT supports only B element sizes.
	  if (VT != MVT::v8i8 && VT != MVT::v16i8)
	    setOperationAction(ISD::CTPOP, VT, Expand);

	  setOperationAction(ISD::UDIV, VT, Expand);
	  setOperationAction(ISD::SDIV, VT, Expand);
	  setOperationAction(ISD::UREM, VT, Expand);
	  setOperationAction(ISD::SREM, VT, Expand);
	  setOperationAction(ISD::FREM, VT, Expand);

	  setOperationAction(ISD::FP_TO_SINT, VT, Custom);
	  setOperationAction(ISD::FP_TO_UINT, VT, Custom);

	  // [SU][MIN|MAX] are available for all NEON types apart from i64.
	  if (!VT.isFloatingPoint() && VT != MVT::v2i64 && VT != MVT::v1i64)
	    for (unsigned Opcode : {ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX})
	      setOperationAction(Opcode, VT, Legal);

	  // F[MIN|MAX][NUM|NAN] are available for all FP NEON types (not f16 though!).
	  if (VT.isFloatingPoint() && VT.getVectorElementType() != MVT::f16)
	    for (unsigned Opcode : {ISD::FMINNAN, ISD::FMAXNAN,
	                            ISD::FMINNUM, ISD::FMAXNUM})
	      setOperationAction(Opcode, VT, Legal);

	  // Indexed vector loads and stores are made legal on little-endian
	  // subtargets only.
	  if (Subtarget->isLittleEndian()) {
	    for (unsigned im = (unsigned)ISD::PRE_INC;
	         im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
	      setIndexedLoadAction(im, VT, Legal);
	      setIndexedStoreAction(im, VT, Legal);
	    }
	  }
	}

	/// Register \p VT in the FPR64 register class and apply the common NEON
	/// configuration to it, passing v2i32 as the promoted bitwise type.
	void AArch64TargetLowering::addDRTypeForNEON(MVT VT) {
	  const MVT PromotedVT = MVT::v2i32;

	  addRegisterClass(VT, &AArch64::FPR64RegClass);
	  addTypeForNEON(VT, PromotedVT);
	}

	/// Register \p VT in the FPR128 register class and apply the common NEON
	/// configuration to it, passing v4i32 as the promoted bitwise type.
	void AArch64TargetLowering::addQRTypeForNEON(MVT VT) {
	  const MVT PromotedVT = MVT::v4i32;

	  addRegisterClass(VT, &AArch64::FPR128RegClass);
	  addTypeForNEON(VT, PromotedVT);
	}

	/// Return the result type of a SETCC on operands of type \p VT: the
	/// integer-element counterpart of \p VT for vectors, i32 for everything else.
	EVT AArch64TargetLowering::getSetCCResultType(const DataLayout &, LLVMContext &,
	                                              EVT VT) const {
	  if (VT.isVector())
	    return VT.changeVectorElementTypeToInteger();

	  return MVT::i32;
	}

	/// computeKnownBitsForTargetNode - Determine which bits of the target node
	/// \p Op are known to be zero or one and report them in the
	/// \p KnownZero / \p KnownOne bitsets.
	///
	/// Handles AArch64ISD::CSEL, the ldxr/ldaxr intrinsics and the NEON
	/// umaxv/uminv intrinsics; every other node leaves the bitsets untouched.
	void AArch64TargetLowering::computeKnownBitsForTargetNode(
	    const SDValue Op, APInt &KnownZero, APInt &KnownOne,
	    const SelectionDAG &DAG, unsigned Depth) const {
	  switch (Op.getOpcode()) {
	  default:
	    break;
	  case AArch64ISD::CSEL: {
	    // Keep only the bits that are known, with the same value, in both
	    // operand 0 and operand 1.
	    APInt KnownZero2, KnownOne2;
	    DAG.computeKnownBits(Op->getOperand(0), KnownZero, KnownOne, Depth + 1);
	    DAG.computeKnownBits(Op->getOperand(1), KnownZero2, KnownOne2, Depth + 1);
	    KnownZero &= KnownZero2;
	    KnownOne &= KnownOne2;
	    break;
	  }
	  case ISD::INTRINSIC_W_CHAIN: {
	    // The intrinsic ID is read from operand 1 here.
	    ConstantSDNode *CN = cast<ConstantSDNode>(Op->getOperand(1));
	    Intrinsic::ID IntID = static_cast<Intrinsic::ID>(CN->getZExtValue());
	    switch (IntID) {
	    default:
	      return;
	    case Intrinsic::aarch64_ldaxr:
	    case Intrinsic::aarch64_ldxr: {
	      // Every bit above the width of the memory type is known zero.
	      unsigned BitWidth = KnownOne.getBitWidth();
	      EVT VT = cast<MemIntrinsicSDNode>(Op)->getMemoryVT();
	      unsigned MemBits = VT.getScalarSizeInBits();
	      KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - MemBits);
	      return;
	    }
	    }
	    break;
	  }
	  case ISD::INTRINSIC_WO_CHAIN:
	  case ISD::INTRINSIC_VOID: {
	    // The intrinsic ID is read from operand 0 here.
	    unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
	    switch (IntNo) {
	    default:
	      break;
	    case Intrinsic::aarch64_neon_umaxv:
	    case Intrinsic::aarch64_neon_uminv: {
	      // Figure out the datatype of the vector operand. The UMINV instruction
	      // will zero extend the result, so we can mark as known zero all the
	      // bits larger than the element datatype. 32-bit or larger doesn't need
	      // this as those are legal types and will be handled by isel directly.
	      MVT VT = Op.getOperand(1).getValueType().getSimpleVT();
	      unsigned BitWidth = KnownZero.getBitWidth();
	      if (VT == MVT::v8i8 || VT == MVT::v16i8) {
	        assert(BitWidth >= 8 && "Unexpected width!");
	        APInt Mask = APInt::getHighBitsSet(BitWidth, BitWidth - 8);
	        KnownZero |= Mask;
	      } else if (VT == MVT::v4i16 || VT == MVT::v8i16) {
	        assert(BitWidth >= 16 && "Unexpected width!");
	        APInt Mask = APInt::getHighBitsSet(BitWidth, BitWidth - 16);
	        KnownZero |= Mask;
	      }
	      break;
	    }
	    }
	    break;
	  }
	  }
	}

	/// Return the type used for scalar shift amounts: always i64, regardless of
	/// either argument.
	MVT AArch64TargetLowering::getScalarShiftAmountTy(const DataLayout &DL,
	                                                  EVT) const {
	  const MVT ShiftAmountTy = MVT::i64;
	  return ShiftAmountTy;
	}

	/// Report whether a misaligned access of type \p VT is permitted and, through
	/// \p Fast when it is non-null, whether such an access is considered fast.
	///
	/// \param VT        Type of the access.
	/// \param AddrSpace Not read by this implementation.
	/// \param Align     Alignment of the access; values of 2 or less mark it fast.
	/// \param Fast      Optional out-parameter; written only when the function
	///                  returns true.
	/// \returns false when the subtarget requires strict alignment, true
	///          otherwise.
	bool AArch64TargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
	                                                           unsigned AddrSpace,
	                                                           unsigned Align,
	                                                           bool *Fast) const {
	  if (Subtarget->requiresStrictAlign())
	    return false;

	  if (Fast) {
	    // Some CPUs are fine with unaligned stores except for 128-bit ones.
	    *Fast = !Subtarget->isMisaligned128StoreSlow() || VT.getStoreSize() != 16 ||
	            // See comments in performSTORECombine() for more details about
	            // these conditions.

	            // Code that uses clang vector extensions can mark that it
	            // wants unaligned accesses to be treated as fast by
	            // underspecifying alignment to be 1 or 2.
	            Align <= 2 ||

	            // Disregard v2i64. Memcpy lowering produces those and splitting
	            // them regresses performance on micro-benchmarks and olden/bh.
	            VT == MVT::v2i64;
	  }
	  return true;
	}

	/// Create the fast instruction selector by forwarding both arguments to
	/// AArch64::createFastISel.
	FastISel *
	AArch64TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
	                                      const TargetLibraryInfo *libInfo) const {
	  FastISel *const Selector = AArch64::createFastISel(funcInfo, libInfo);
	  return Selector;
	}

	const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
	switch ((AArch64ISD::NodeType)Opcode) {
	case AArch64ISD::FIRST_NUMBER: break;
	case AArch64ISD::CALL: return "AArch64ISD::CALL";
	case AArch64ISD::ADRP: return "AArch64ISD::ADRP";
	case AArch64ISD::ADDlow: return "AArch64ISD::ADDlow";
	case AArch64ISD::LOADgot: return "AArch64ISD::LOADgot";
	case AArch64ISD::RET_FLAG: return "AArch64ISD::RET_FLAG";
	case AArch64ISD::BRCOND: return "AArch64ISD::BRCOND";
	case AArch64ISD::CSEL: return "AArch64ISD::CSEL";
	case AArch64ISD::FCSEL: return "AArch64ISD::FCSEL";
	case AArch64ISD::CSINV: return "AArch64ISD::CSINV";
	case AArch64ISD::CSNEG: return "AArch64ISD::CSNEG";
	case AArch64ISD::CSINC: return "AArch64ISD::CSINC";
	case AArch64ISD::THREAD_POINTER: return "AArch64ISD::THREAD_POINTER";
	case AArch64ISD::TLSDESC_CALLSEQ: return "AArch64ISD::TLSDESC_CALLSEQ";
	case AArch64ISD::ADC: return "AArch64ISD::ADC";
	case AArch64ISD::SBC: return "AArch64ISD::SBC";
	case AArch64ISD::ADDS: return "AArch64ISD::ADDS";
	case AArch64ISD::SUBS: return "AArch64ISD::SUBS";
	case AArch64ISD::ADCS: return "AArch64ISD::ADCS";
	case AArch64ISD::SBCS: return "AArch64ISD::SBCS";
	case AArch64ISD::ANDS: return "AArch64ISD::ANDS";
	case AArch64ISD::CCMP: return "AArch64ISD::CCMP";
	case AArch64ISD::CCMN: return "AArch64ISD::CCMN";
	case AArch64ISD::FCCMP: return "AArch64ISD::FCCMP";
	case AArch64ISD::FCMP: return "AArch64ISD::FCMP";
	case AArch64ISD::DUP: return "AArch64ISD::DUP";
	case AArch64ISD::DUPLANE8: return "AArch64ISD::DUPLANE8";
	case AArch64ISD::DUPLANE16: return "AArch64ISD::DUPLANE16";
	case AArch64ISD::DUPLANE32: return "AArch64ISD::DUPLANE32";
	case AArch64ISD::DUPLANE64: return "AArch64ISD::DUPLANE64";
	case AArch64ISD::MOVI: return "AArch64ISD::MOVI";
	case AArch64ISD::MOVIshift: return "AArch64ISD::MOVIshift";
	case AArch64ISD::MOVIedit: return "AArch64ISD::MOVIedit";
	case AArch64ISD::MOVImsl: return "AArch64ISD::MOVImsl";
	case AArch64ISD::FMOV: return "AArch64ISD::FMOV";
	case AArch64ISD::MVNIshift: return "AArch64ISD::MVNIshift";
	case AArch64ISD::MVNImsl: return "AArch64ISD::MVNImsl";
	case AArch64ISD::BICi: return "AArch64ISD::BICi";
	case AArch64ISD::ORRi: return "AArch64ISD::ORRi";
	case AArch64ISD::BSL: return "AArch64ISD::BSL";
	case AArch64ISD::NEG: return "AArch64ISD::NEG";
	case AArch64ISD::EXTR: return "AArch64ISD::EXTR";
	case AArch64ISD::ZIP1: return "AArch64ISD::ZIP1";
	case AArch64ISD::ZIP2: return "AArch64ISD::ZIP2";
	case AArch64ISD::UZP1: return "AArch64ISD::UZP1";
	case AArch64ISD::UZP2: return "AArch64ISD::UZP2";
	case AArch64ISD::TRN1: return "AArch64ISD::TRN1";
	case AArch64ISD::TRN2: return "AArch64ISD::TRN2";
	case AArch64ISD::REV16: return "AArch64ISD::REV16";
	case AArch64ISD::REV32: return "AArch64ISD::REV32";
	case AArch64ISD::REV64: return "AArch64ISD::REV64";
	case AArch64ISD::EXT: return "AArch64ISD::EXT";
	case AArch64ISD::VSHL: return "AArch64ISD::VSHL";
	case AArch64ISD::VLSHR: return "AArch64ISD::VLSHR";
	case AArch64ISD::VASHR: return "AArch64ISD::VASHR";
	case AArch64ISD::CMEQ: return "AArch64ISD::CMEQ";
	case AArch64ISD::CMGE: return "AArch64ISD::CMGE";
	case AArch64ISD::CMGT: return "AArch64ISD::CMGT";
	case AArch64ISD::CMHI: return "AArch64ISD::CMHI";
	case AArch64ISD::CMHS: return "AArch64ISD::CMHS";
	case AArch64ISD::FCMEQ: return "AArch64ISD::FCMEQ";
	case AArch64ISD::FCMGE: return "AArch64ISD::FCMGE";
	case AArch64ISD::FCMGT: return "AArch64ISD::FCMGT";
	case AArch64ISD::CMEQz: return "AArch64ISD::CMEQz";
	case AArch64ISD::CMGEz: return "AArch64ISD::CMGEz";
	case AArch64ISD::CMGTz: return "AArch64ISD::CMGTz";
	case AArch64ISD::CMLEz: return "AArch64ISD::CMLEz";
	case AArch64ISD::CMLTz: return "AArch64ISD::CMLTz";
	case AArch64ISD::FCMEQz: return "AArch64ISD::FCMEQz";
	case AArch64ISD::FCMGEz: return "AArch64ISD::FCMGEz";
	case AArch64ISD::FCMGTz: return "AArch64ISD::FCMGTz";
	case AArch64ISD::FCMLEz: return "AArch64ISD::FCMLEz";
	case AArch64ISD::FCMLTz: return "AArch64ISD::FCMLTz";
	case AArch64ISD::SADDV: return "AArch64ISD::SADDV";
	case AArch64ISD::UADDV: return "AArch64ISD::UADDV";
	case AArch64ISD::SMINV: return "AArch64ISD::SMINV";
	case AArch64ISD::UMINV: return "AArch64ISD::UMINV";
	case AArch64ISD::SMAXV: return "AArch64ISD::SMAXV";
	case AArch64ISD::UMAXV: return "AArch64ISD::UMAXV";
	case AArch64ISD::NOT: return "AArch64ISD::NOT";
	case AArch64ISD::BIT: return "AArch64ISD::BIT";
	case AArch64ISD::CBZ: return "AArch64ISD::CBZ";
	case AArch64ISD::CBNZ: return "AArch64ISD::CBNZ";
	case AArch64ISD::TBZ: return "AArch64ISD::TBZ";
	case AArch64ISD::TBNZ: return "AArch64ISD::TBNZ";
	case AArch64ISD::TC_RETURN: return "AArch64ISD::TC_RETURN";
	case AArch64ISD::PREFETCH: return "AArch64ISD::PREFETCH";
	case AArch64ISD::SITOF: return "AArch64ISD::SITOF";
	case AArch64ISD::UITOF: return "AArch64ISD::UITOF";
	case AArch64ISD::NVCAST: return "AArch64ISD::NVCAST";
	case AArch64ISD::SQSHL_I: return "AArch64ISD::SQSHL_I";
	case AArch64ISD::UQSHL_I: return "AArch64ISD::UQSHL_I";
	case AArch64ISD::SRSHR_I: return "AArch64ISD::SRSHR_I";
	case AArch64ISD::URSHR_I: return "AArch64ISD::URSHR_I";
	case AArch64ISD::SQSHLU_I: return "AArch64ISD::SQSHLU_I";
	case AArch64ISD::WrapperLarge: return "AArch64ISD::WrapperLarge";
	case AArch64ISD::LD2post: return "AArch64ISD::LD2post";
	case AArch64ISD::LD3post: return "AArch64ISD::LD3post";
	case AArch64ISD::LD4post: return "AArch64ISD::LD4post";
	case AArch64ISD::ST2post: return "AArch64ISD::ST2post";
	case AArch64ISD::ST3post: return "AArch64ISD::ST3post";
	case AArch64ISD::ST4post: return "AArch64ISD::ST4post";
	case AArch64ISD::LD1x2post: return "AArch64ISD::LD1x2post";
	case AArch64ISD::LD1x3post: return "AArch64ISD::LD1x3post";
	case AArch64ISD::LD1x4post: return "AArch64ISD::LD1x4post";
	case AArch64ISD::ST1x2post: return "AArch64ISD::ST1x2post";
	case AArch64ISD::ST1x3post: return "AArch64ISD::ST1x3post";
	case AArch64ISD::ST1x4post: return "AArch64ISD::ST1x4post";
	case AArch64ISD::LD1DUPpost: return "AArch64ISD::LD1DUPpost";
	case AArch64ISD::LD2DUPpost: return "AArch64ISD::LD2DUPpost";
	case AArch64ISD::LD3DUPpost: return "AArch64ISD::LD3DUPpost";
	case AArch64ISD::LD4DUPpost: return "AArch64ISD::LD4DUPpost";
	case AArch64ISD::LD1LANEpost: return "AArch64ISD::LD1LANEpost";
	case AArch64ISD::LD2LANEpost: return "AArch64ISD::LD2LANEpost";
	case AArch64ISD::LD3LANEpost: return "AArch64ISD::LD3LANEpost";
	case AArch64ISD::LD4LANEpost: return "AArch64ISD::LD4LANEpost";
	case AArch64ISD::ST2LANEpost: return "AArch64ISD::ST2LANEpost";
	case AArch64ISD::ST3LANEpost: return "AArch64ISD::ST3LANEpost";
	case AArch64ISD::ST4LANEpost: return "AArch64ISD::ST4LANEpost";
	case AArch64ISD::SMULL: return "AArch64ISD::SMULL";
	case AArch64ISD::UMULL: return "AArch64ISD::UMULL";
	case AArch64ISD::FRECPE: return "AArch64ISD::FRECPE";
	case AArch64ISD::FRECPS: return "AArch64ISD::FRECPS";
	case AArch64ISD::FRSQRTE: return "AArch64ISD::FRSQRTE";
	case AArch64ISD::FRSQRTS: return "AArch64ISD::FRSQRTS";
	}
	return nullptr;
	}

	/// Expand the F128CSEL pseudo-instruction \p MI, located in \p MBB, into a
	/// conditional branch, a fall-through block and a PHI.
	///
	/// \param MI  The F128CSEL instruction; it is erased before returning.
	/// \param MBB The block containing \p MI; it keeps the instructions before
	///            \p MI and gains the two new branches.
	/// \returns The newly created end block, which holds the PHI followed by the
	///          instructions that originally came after \p MI.
	MachineBasicBlock *
	AArch64TargetLowering::EmitF128CSEL(MachineInstr &MI,
	MachineBasicBlock *MBB) const {
	// We materialise the F128CSEL pseudo-instruction as some control flow and a
	// phi node:

	// OrigBB:
	// [... previous instrs leading to comparison ...]
	// b.ne TrueBB
	// b EndBB
	// TrueBB:
	// ; Fallthrough
	// EndBB:
	// Dest = PHI [IfTrue, TrueBB], [IfFalse, OrigBB]

	MachineFunction *MF = MBB->getParent();
	const TargetInstrInfo *TII = Subtarget->getInstrInfo();
	const BasicBlock *LLVM_BB = MBB->getBasicBlock();
	DebugLoc DL = MI.getDebugLoc();
	// Position just after MBB in the function; the new blocks are inserted here.
	MachineFunction::iterator It = ++MBB->getIterator();

	// Operands 0-2 are the destination and the two candidate values, operand 3
	// is the condition-code immediate. Operand 4 is queried only for its kill
	// flag (presumably the NZCV use -- confirm against the pseudo's definition).
	unsigned DestReg = MI.getOperand(0).getReg();
	unsigned IfTrueReg = MI.getOperand(1).getReg();
	unsigned IfFalseReg = MI.getOperand(2).getReg();
	unsigned CondCode = MI.getOperand(3).getImm();
	bool NZCVKilled = MI.getOperand(4).isKill();

	// Both insertions use the same position, so the resulting layout is
	// MBB, TrueBB, EndBB.
	MachineBasicBlock *TrueBB = MF->CreateMachineBasicBlock(LLVM_BB);
	MachineBasicBlock *EndBB = MF->CreateMachineBasicBlock(LLVM_BB);
	MF->insert(It, TrueBB);
	MF->insert(It, EndBB);

	// Transfer rest of current basic-block to EndBB
	EndBB->splice(EndBB->begin(), MBB, std::next(MachineBasicBlock::iterator(MI)),
	MBB->end());
	EndBB->transferSuccessorsAndUpdatePHIs(MBB);

	// Terminate MBB: conditional branch to TrueBB, otherwise branch to EndBB.
	BuildMI(MBB, DL, TII->get(AArch64::Bcc)).addImm(CondCode).addMBB(TrueBB);
	BuildMI(MBB, DL, TII->get(AArch64::B)).addMBB(EndBB);
	MBB->addSuccessor(TrueBB);
	MBB->addSuccessor(EndBB);

	// TrueBB falls through to the end.
	TrueBB->addSuccessor(EndBB);

	// If MI did not kill NZCV, record it as live into both new blocks.
	if (!NZCVKilled) {
	TrueBB->addLiveIn(AArch64::NZCV);
	EndBB->addLiveIn(AArch64::NZCV);
	}

	// Select the result at the top of EndBB: IfTrueReg when arriving from
	// TrueBB, IfFalseReg when arriving directly from MBB.
	BuildMI(*EndBB, EndBB->begin(), DL, TII->get(AArch64::PHI), DestReg)
	.addReg(IfTrueReg)
	.addMBB(TrueBB)
	.addReg(IfFalseReg)
	.addMBB(MBB);

	MI.eraseFromParent();
	return EndBB;
	}

	/// Dispatch the custom-inserted instruction \p MI in \p BB to its expansion
	/// routine: F128CSEL goes to EmitF128CSEL, STACKMAP and PATCHPOINT go to
	/// emitPatchPoint. Any other opcode is a programming error; in builds without
	/// NDEBUG the instruction is dumped before the unreachable is hit.
	MachineBasicBlock *AArch64TargetLowering::EmitInstrWithCustomInserter(
	    MachineInstr &MI, MachineBasicBlock *BB) const {
	  const unsigned Opc = MI.getOpcode();

	  if (Opc == AArch64::F128CSEL)
	    return EmitF128CSEL(MI, BB);

	  if (Opc == TargetOpcode::STACKMAP || Opc == TargetOpcode::PATCHPOINT)
	    return emitPatchPoint(MI, BB);

	#ifndef NDEBUG
	  MI.dump();
	#endif
	  llvm_unreachable("Unexpected instruction for custom inserter!");
	}

	//===----------------------------------------------------------------------===//
	// AArch64 Lowering private implementation.
	//===----------------------------------------------------------------------===//

	//===----------------------------------------------------------------------===//
	// Lowering Code
	//===----------------------------------------------------------------------===//

	/// Map the DAG integer condition code \p CC onto the corresponding AArch64
	/// condition code. Codes other than the ten integer comparisons listed below
	/// are not expected and hit llvm_unreachable.
	static AArch64CC::CondCode changeIntCCToAArch64CC(ISD::CondCode CC) {
	  switch (CC) {
	  // Equality.
	  case ISD::SETEQ:  return AArch64CC::EQ;
	  case ISD::SETNE:  return AArch64CC::NE;
	  // Signed orderings.
	  case ISD::SETGT:  return AArch64CC::GT;
	  case ISD::SETGE:  return AArch64CC::GE;
	  case ISD::SETLT:  return AArch64CC::LT;
	  case ISD::SETLE:  return AArch64CC::LE;
	  // Unsigned orderings.
	  case ISD::SETUGT: return AArch64CC::HI;
	  case ISD::SETUGE: return AArch64CC::HS;
	  case ISD::SETULT: return AArch64CC::LO;
	  case ISD::SETULE: return AArch64CC::LS;
	  default:
	    llvm_unreachable("Unknown condition code!");
	  }
	}

	/// Translate the DAG floating-point condition code \p CC into AArch64
	/// condition codes.
	///
	/// \p CondCode always receives a condition. \p CondCode2 receives
	/// AArch64CC::AL unless \p CC is SETONE or SETUEQ, which need a second
	/// condition. Unrecognised codes hit llvm_unreachable.
	static void changeFPCCToAArch64CC(ISD::CondCode CC,
	                                  AArch64CC::CondCode &CondCode,
	                                  AArch64CC::CondCode &CondCode2) {
	  // Start from "no second condition"; only two cases below override it.
	  CondCode2 = AArch64CC::AL;

	  switch (CC) {
	  // Predicates that need two conditions.
	  case ISD::SETONE:
	    CondCode = AArch64CC::MI;
	    CondCode2 = AArch64CC::GT;
	    break;
	  case ISD::SETUEQ:
	    CondCode = AArch64CC::EQ;
	    CondCode2 = AArch64CC::VS;
	    break;

	  // Predicates whose plain and ordered forms share a condition.
	  case ISD::SETEQ:
	  case ISD::SETOEQ:
	    CondCode = AArch64CC::EQ;
	    break;
	  case ISD::SETGT:
	  case ISD::SETOGT:
	    CondCode = AArch64CC::GT;
	    break;
	  case ISD::SETGE:
	  case ISD::SETOGE:
	    CondCode = AArch64CC::GE;
	    break;

	  // Remaining ordered predicates.
	  case ISD::SETOLT:
	    CondCode = AArch64CC::MI;
	    break;
	  case ISD::SETOLE:
	    CondCode = AArch64CC::LS;
	    break;
	  case ISD::SETO:
	    CondCode = AArch64CC::VC;
	    break;

	  // Remaining unordered predicates.
	  case ISD::SETUO:
	    CondCode = AArch64CC::VS;
	    break;
	  case ISD::SETUGT:
	    CondCode = AArch64CC::HI;
	    break;
	  case ISD::SETUGE:
	    CondCode = AArch64CC::PL;
	    break;

	  // Predicates whose plain and unordered forms share a condition.
	  case ISD::SETLT:
	  case ISD::SETULT:
	    CondCode = AArch64CC::LT;
	    break;
	  case ISD::SETLE:
	  case ISD::SETULE:
	    CondCode = AArch64CC::LE;
	    break;
	  case ISD::SETNE:
	  case ISD::SETUNE:
	    CondCode = AArch64CC::NE;
	    break;

	  default:
	    llvm_unreachable("Unknown FP condition!");
	  }
	}

	/// Translate the DAG floating-point condition code \p CC into AArch64
	/// condition codes that are meant to be AND'ed together, unlike the pair
	/// produced by changeFPCCToAArch64CC, which is meant to be OR'ed.
	///
	/// Only SETONE and SETUEQ get a dedicated pair here; every other code is
	/// forwarded to changeFPCCToAArch64CC and must yield a single condition.
	static void changeFPCCToANDAArch64CC(ISD::CondCode CC,
	                                     AArch64CC::CondCode &CondCode,
	                                     AArch64CC::CondCode &CondCode2) {
	  CondCode2 = AArch64CC::AL;

	  if (CC == ISD::SETONE) {
	    // (a one b)
	    //   == ((a olt b) || (a ogt b))
	    //   == ((a ord b) && (a une b))
	    CondCode = AArch64CC::VC;
	    CondCode2 = AArch64CC::NE;
	    return;
	  }

	  if (CC == ISD::SETUEQ) {
	    // (a ueq b)
	    //   == ((a uno b) || (a oeq b))
	    //   == ((a ule b) && (a uge b))
	    CondCode = AArch64CC::PL;
	    CondCode2 = AArch64CC::LE;
	    return;
	  }

	  // Everything else maps onto a single condition.
	  changeFPCCToAArch64CC(CC, CondCode, CondCode2);
	  assert(CondCode2 == AArch64CC::AL);
	}

	/// Translate the DAG floating-point condition code \p CC into AArch64
	/// condition codes usable with the vector compare instructions. Fewer
	/// operations are available without a real NZCV register, so some predicates
	/// are expressed as the inverse of another one.
	///
	/// \param [out] CondCode  First condition.
	/// \param [out] CondCode2 Second condition.
	/// \param [out] Invert    Set to true when the caller must invert the result
	///                        obtained from the returned conditions.
	static void changeVectorFPCCToAArch64CC(ISD::CondCode CC,
	                                        AArch64CC::CondCode &CondCode,
	                                        AArch64CC::CondCode &CondCode2,
	                                        bool &Invert) {
	  switch (CC) {
	  case ISD::SETO:
	  case ISD::SETUO:
	    // Both predicates use the MI/GE pair; SETUO additionally asks for the
	    // result to be inverted.
	    Invert = (CC == ISD::SETUO);
	    CondCode = AArch64CC::MI;
	    CondCode2 = AArch64CC::GE;
	    return;

	  case ISD::SETUEQ:
	  case ISD::SETULT:
	  case ISD::SETULE:
	  case ISD::SETUGT:
	  case ISD::SETUGE:
	    // All of the compare-mask comparisons are ordered, but we can switch
	    // between the two by a double inversion. E.g. ULE == !OGT.
	    Invert = true;
	    changeFPCCToAArch64CC(getSetCCInverse(CC, false), CondCode, CondCode2);
	    return;

	  default:
	    // Mostly the scalar mappings work fine.
	    Invert = false;
	    changeFPCCToAArch64CC(CC, CondCode, CondCode2);
	    return;
	  }
	}

	/// Return true if \p C can be used as an arithmetic immediate: either it fits
	/// in the low 12 bits, or its low 12 bits are zero and it fits in 24 bits.
	static bool isLegalArithImmed(uint64_t C) {
	  // Matches AArch64DAGToDAGISel::SelectArithImmed().
	  const bool FitsUnshifted = (C >> 12) == 0;
	  const bool FitsShiftedBy12 = (C & 0xFFFULL) == 0 && (C >> 24) == 0;
	  return FitsUnshifted || FitsShiftedBy12;
	}

	/// Build the DAG node that compares \p LHS with \p RHS for the condition
	/// \p CC.
	///
	/// Floating-point operands yield an AArch64ISD::FCMP node; f16 operands are
	/// extended to f32 first and f128 is not accepted. Integer operands yield
	/// result number 1 of a SUBS node, or of an ADDS / ANDS node when one of the
	/// patterns described below matches.
	///
	/// \param dl  Source location attached to the new nodes.
	/// \param DAG The selection DAG in which the nodes are created.
	static SDValue emitComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC,
	                              const SDLoc &dl, SelectionDAG &DAG) {
	  EVT VT = LHS.getValueType();

	  if (VT.isFloatingPoint()) {
	    assert(VT != MVT::f128);
	    if (VT == MVT::f16) {
	      LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, LHS);
	      RHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, RHS);
	      VT = MVT::f32;
	    }
	    return DAG.getNode(AArch64ISD::FCMP, dl, VT, LHS, RHS);
	  }

	  // The CMP instruction is just an alias for SUBS, and representing it as
	  // SUBS means that it's possible to get CSE with subtract operations.
	  // A later phase can perform the optimization of setting the destination
	  // register to WZR/XZR if it ends up being unused.
	  unsigned Opcode = AArch64ISD::SUBS;

	  if (RHS.getOpcode() == ISD::SUB && isNullConstant(RHS.getOperand(0)) &&
	      (CC == ISD::SETEQ || CC == ISD::SETNE)) {
	    // We'd like to combine a (CMP op1, (sub 0, op2) into a CMN instruction on
	    // the grounds that "op1 - (-op2) == op1 + op2". However, the C and V flags
	    // can be set differently by this operation. It comes down to whether
	    // "SInt(~op2)+1 == SInt(~op2+1)" (and the same for UInt). If they are then
	    // everything is fine. If not then the optimization is wrong. Thus general
	    // comparisons are only valid if op2 != 0.

	    // So, finally, the only LLVM-native comparisons that don't mention C and V
	    // are SETEQ and SETNE. They're the only ones we can safely use CMN for in
	    // the absence of information about op2.
	    Opcode = AArch64ISD::ADDS;
	    RHS = RHS.getOperand(1);
	  } else if (LHS.getOpcode() == ISD::AND && isNullConstant(RHS) &&
	             !isUnsignedIntSetCC(CC)) {
	    // Similarly, (CMP (and X, Y), 0) can be implemented with a TST
	    // (a.k.a. ANDS) except that the flags are only guaranteed to work for one
	    // of the signed comparisons.
	    Opcode = AArch64ISD::ANDS;
	    RHS = LHS.getOperand(1);
	    LHS = LHS.getOperand(0);
	  }

	  // The node has two results (VT and MVT_CC); callers get the second one.
	  return DAG.getNode(Opcode, dl, DAG.getVTList(VT, MVT_CC), LHS, RHS)
	      .getValue(1);
	}

	/// \defgroup AArch64CCMP CMP;CCMP matching
	///
	/// These functions deal with the formation of CMP;CCMP;... sequences.
	/// The CCMP/CCMN/FCCMP/FCCMPE instructions allow the conditional execution of
	/// a comparison. They set the NZCV flags to a predefined value if their
	/// predicate is false. This makes it possible to express arbitrary
	/// conjunctions, for example "cmp 0 (and (setCA (cmp A)) (setCB (cmp B)))"
	/// expressed as:
	/// cmp A
	/// ccmp B, inv(CB), CA
	/// check for CB flags
	///
	/// In general we can create code for arbitrary "... (and (and A B) C)"
	/// sequences. We can also implement some "or" expressions, because "(or A B)"
	/// is equivalent to "not (and (not A) (not B))" and we can implement some
	/// negation operations:
	/// We can negate the results of a single comparison by inverting the flags
	/// used when the predicate fails and inverting the flags tested in the next
	/// instruction; We can also negate the results of the whole previous
	/// conditional compare sequence by inverting the flags tested in the next
	/// instruction. However there is no way to negate the result of a partial
	/// sequence.
	///
	/// Therefore on encountering an "or" expression we can negate the subtree on
	/// one side and have to be able to push the negate to the leafs of the subtree
	/// on the other side (see also the comments in code). As complete example:
	/// "or (or (setCA (cmp A)) (setCB (cmp B)))
	/// (and (setCC (cmp C)) (setCD (cmp D)))"
	/// is transformed to
	/// "not (and (not (and (setCC (cmp C)) (setCD (cmp D))))
	/// (and (not (setCA (cmp A))) (not (setCB (cmp B)))))"
	/// and implemented as:
	/// cmp C
	/// ccmp D, inv(CD), CC
	/// ccmp A, CA, inv(CD)
	/// ccmp B, CB, inv(CA)
	/// check for CB flags
	/// A counterexample is "or (and A B) (and C D)" which cannot be implemented
	/// by conditional compare sequences.
	/// @{

	/// Create a conditional comparison; use CCMP, CCMN or FCCMP as appropriate.
	///
	/// \param LHS        Left operand of the comparison (f16 is extended to f32).
	/// \param RHS        Right operand; a "0 - x" RHS is folded into a CCMN when
	///                   \p CC is SETEQ or SETNE.
	/// \param CC         Condition of the SETCC being lowered. It is only used to
	///                   decide whether the CCMN fold is allowed.
	/// \param CCOp       Flags-producing node this comparison is chained onto.
	/// \param Predicate  Condition code emitted as the node's condition operand.
	/// \param OutCC      Condition that is tested on the result afterwards. The
	///                   NZCV immediate is picked to satisfy the inverse of it.
	/// \returns the newly created node of type MVT_CC.
	static SDValue emitConditionalComparison(SDValue LHS, SDValue RHS,
	                                         ISD::CondCode CC, SDValue CCOp,
	                                         AArch64CC::CondCode Predicate,
	                                         AArch64CC::CondCode OutCC,
	                                         const SDLoc &DL, SelectionDAG &DAG) {
	  // Integer CCMP is the default; the cases below refine the choice.
	  unsigned Opcode = AArch64ISD::CCMP;
	  const EVT OperandVT = LHS.getValueType();
	  if (OperandVT.isFloatingPoint()) {
	    assert(OperandVT != MVT::f128);
	    if (OperandVT == MVT::f16) {
	      LHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, LHS);
	      RHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, RHS);
	    }
	    Opcode = AArch64ISD::FCCMP;
	  } else if (RHS.getOpcode() == ISD::SUB &&
	             isNullConstant(RHS.getOperand(0)) &&
	             (CC == ISD::SETEQ || CC == ISD::SETNE)) {
	    // See emitComparison() on why we can only do this for SETEQ and SETNE.
	    Opcode = AArch64ISD::CCMN;
	    RHS = RHS.getOperand(1);
	  }

	  SDValue Condition = DAG.getConstant(Predicate, DL, MVT_CC);
	  // The flags written when the predicate fails must make a later test of
	  // OutCC fail, hence the inverted condition.
	  AArch64CC::CondCode InvOutCC = AArch64CC::getInvertedCondCode(OutCC);
	  unsigned NZCV = AArch64CC::getNZCVToSatisfyCondCode(InvOutCC);
	  SDValue NZCVOp = DAG.getConstant(NZCV, DL, MVT::i32);
	  return DAG.getNode(Opcode, DL, MVT_CC, LHS, RHS, NZCVOp, Condition, CCOp);
	}

	/// Returns true if @p Val is a tree of AND/OR/SETCC operations in which every
	/// node has exactly one use.
	/// @p CanNegate is set to true if we can push a negate operation through
	/// the tree in a way that we are left with AND operations and negate operations
	/// at the leafs only. i.e. "not (or (or x y) z)" can be changed to
	/// "and (and (not x) (not y)) (not z)"; "not (or (and x y) z)" cannot be
	/// brought into such a form. @p CanNegate is only written when the function
	/// returns true.
	/// @p Depth is the current recursion depth; AND/OR nodes deeper than 6 levels
	/// are rejected.
	static bool isConjunctionDisjunctionTree(const SDValue Val, bool &CanNegate,
	                                         unsigned Depth = 0) {
	  if (!Val.hasOneUse())
	    return false;
	  const unsigned Opcode = Val->getOpcode();
	  if (Opcode == ISD::SETCC) {
	    // f128 comparisons are not handled by this transformation.
	    if (Val->getOperand(0).getValueType() == MVT::f128)
	      return false;
	    CanNegate = true;
	    return true;
	  }
	  // Protect against exponential runtime and stack overflow.
	  if (Depth > 6)
	    return false;
	  if (Opcode != ISD::AND && Opcode != ISD::OR)
	    return false;

	  SDValue O0 = Val->getOperand(0);
	  SDValue O1 = Val->getOperand(1);
	  bool CanNegateL;
	  if (!isConjunctionDisjunctionTree(O0, CanNegateL, Depth + 1))
	    return false;
	  bool CanNegateR;
	  if (!isConjunctionDisjunctionTree(O1, CanNegateR, Depth + 1))
	    return false;

	  if (Opcode == ISD::OR) {
	    // For an OR expression we need to be able to negate at least one side or
	    // we cannot do the transformation at all.
	    if (!CanNegateL && !CanNegateR)
	      return false;
	    // We can however change a (not (or x y)) to (and (not x) (not y)) if we
	    // can negate the x and y subtrees.
	    CanNegate = CanNegateL && CanNegateR;
	  } else {
	    // If the operands are OR expressions then we finally need to negate their
	    // outputs. We can only do that for the operand which is emitted last, by
	    // negating OutCC, not for both operands.
	    bool NeedsNegOutL = O0->getOpcode() == ISD::OR;
	    bool NeedsNegOutR = O1->getOpcode() == ISD::OR;
	    if (NeedsNegOutL && NeedsNegOutR)
	      return false;
	    // We cannot negate an AND operation (it would become an OR).
	    CanNegate = false;
	  }
	  return true;
	}

	/// Emit conjunction or disjunction tree with the CMP/FCMP followed by a chain
	/// of CCMP/CFCMP ops. See @ref AArch64CCMP.
	/// Tries to transform the given i1 producing node @p Val to a series of
	/// compare and conditional compare operations. @returns an NZCV flags
	/// producing node and sets @p OutCC to the flags that should be tested.
	/// The caller must have checked @p Val with isConjunctionDisjunctionTree();
	/// this is only asserted here.
	/// On recursive invocations @p Negate may be set to true to have negation
	/// effects pushed to the tree leafs; @p CCOp is the flags producing node of
	/// the comparisons emitted so far (empty for the first comparison of the
	/// chain); @p Predicate is an NZCV flag predicate for the comparisons in the
	/// current subtree.
	static SDValue emitConjunctionDisjunctionTreeRec(SelectionDAG &DAG, SDValue Val,
	    AArch64CC::CondCode &OutCC, bool Negate, SDValue CCOp,
	    AArch64CC::CondCode Predicate) {
	  // We're at a tree leaf, produce a conditional comparison operation.
	  unsigned Opcode = Val->getOpcode();
	  if (Opcode == ISD::SETCC) {
	    SDValue LHS = Val->getOperand(0);
	    SDValue RHS = Val->getOperand(1);
	    ISD::CondCode CC = cast<CondCodeSDNode>(Val->getOperand(2))->get();
	    bool isInteger = LHS.getValueType().isInteger();
	    if (Negate)
	      CC = getSetCCInverse(CC, isInteger);
	    SDLoc DL(Val);
	    // Determine OutCC and handle FP special case.
	    if (isInteger) {
	      OutCC = changeIntCCToAArch64CC(CC);
	    } else {
	      assert(LHS.getValueType().isFloatingPoint());
	      AArch64CC::CondCode ExtraCC;
	      changeFPCCToANDAArch64CC(CC, OutCC, ExtraCC);
	      // Some floating point conditions can't be tested with a single condition
	      // code. Construct an additional comparison in this case.
	      if (ExtraCC != AArch64CC::AL) {
	        SDValue ExtraCmp;
	        if (!CCOp.getNode())
	          ExtraCmp = emitComparison(LHS, RHS, CC, DL, DAG);
	        else
	          ExtraCmp = emitConditionalComparison(LHS, RHS, CC, CCOp, Predicate,
	                                               ExtraCC, DL, DAG);
	        // The real comparison below is chained onto the extra one.
	        CCOp = ExtraCmp;
	        Predicate = ExtraCC;
	      }
	    }

	    // Produce a normal comparison if we are first in the chain
	    if (!CCOp)
	      return emitComparison(LHS, RHS, CC, DL, DAG);
	    // Otherwise produce a ccmp.
	    return emitConditionalComparison(LHS, RHS, CC, CCOp, Predicate, OutCC, DL,
	                                     DAG);
	  }
	  assert((Opcode == ISD::AND || (Opcode == ISD::OR && Val->hasOneUse())) &&
	         "Valid conjunction/disjunction tree");

	  // Check if both sides can be transformed.
	  SDValue LHS = Val->getOperand(0);
	  SDValue RHS = Val->getOperand(1);

	  // In case of an OR we need to negate our operands and the result.
	  // (A or B) <=> not(not(A) and not(B))
	  bool NegateOpsAndResult = Opcode == ISD::OR;
	  // We can negate the results of all previous operations by inverting the
	  // predicate flags giving us a free negation for one side. The other side
	  // must be negatable by itself.
	  if (NegateOpsAndResult) {
	    // See which side we can negate.
	    bool CanNegateL;
	    bool isValidL = isConjunctionDisjunctionTree(LHS, CanNegateL);
	    assert(isValidL && "Valid conjunction/disjunction tree");
	    (void)isValidL;

	#ifndef NDEBUG
	    bool CanNegateR;
	    bool isValidR = isConjunctionDisjunctionTree(RHS, CanNegateR);
	    assert(isValidR && "Valid conjunction/disjunction tree");
	    assert((CanNegateL || CanNegateR) && "Valid conjunction/disjunction tree");
	#endif

	    // Order the side which we cannot negate to RHS so we can emit it first.
	    if (!CanNegateL)
	      std::swap(LHS, RHS);
	  } else {
	    bool NeedsNegOutL = LHS->getOpcode() == ISD::OR;
	    assert((!NeedsNegOutL || RHS->getOpcode() != ISD::OR) &&
	           "Valid conjunction/disjunction tree");
	    // Order the side where we need to negate the output flags to RHS so it
	    // gets emitted first.
	    if (NeedsNegOutL)
	      std::swap(LHS, RHS);
	  }

	  // Emit RHS. If we want to negate the tree we only need to push a negate
	  // through if we are already in a negate-pushing case (Negate is set),
	  // otherwise we can negate the "flags to test" afterwards.
	  AArch64CC::CondCode RHSCC;
	  SDValue CmpR = emitConjunctionDisjunctionTreeRec(DAG, RHS, RHSCC, Negate,
	                                                   CCOp, Predicate);
	  if (NegateOpsAndResult && !Negate)
	    RHSCC = AArch64CC::getInvertedCondCode(RHSCC);
	  // Emit LHS. We may need to negate it.
	  SDValue CmpL = emitConjunctionDisjunctionTreeRec(DAG, LHS, OutCC,
	                                                   NegateOpsAndResult, CmpR,
	                                                   RHSCC);
	  // If we transformed an OR to an AND then we have to negate the result
	  // (or absorb the Negate parameter).
	  if (NegateOpsAndResult && !Negate)
	    OutCC = AArch64CC::getInvertedCondCode(OutCC);
	  return CmpL;
	}

	/// Entry point for lowering an AND/OR/SETCC tree to a CMP/FCMP followed by a
	/// chain of CCMP/CFCMP ops. See @ref AArch64CCMP.
	/// Returns an empty SDValue when @p Val is not a tree accepted by
	/// isConjunctionDisjunctionTree(); otherwise the flags producing node, with
	/// @p OutCC set to the condition that has to be tested on it.
	/// \see emitConjunctionDisjunctionTreeRec().
	static SDValue emitConjunctionDisjunctionTree(SelectionDAG &DAG, SDValue Val,
	                                              AArch64CC::CondCode &OutCC) {
	  // Only the yes/no answer matters at the root; negatability is irrelevant.
	  bool IgnoredCanNegate;
	  const bool IsSupportedTree =
	      isConjunctionDisjunctionTree(Val, IgnoredCanNegate);
	  if (IsSupportedTree)
	    return emitConjunctionDisjunctionTreeRec(DAG, Val, OutCC, /*Negate=*/false,
	                                             /*CCOp=*/SDValue(),
	                                             /*Predicate=*/AArch64CC::AL);
	  return SDValue();
	}

	/// @}

	/// Emit the flag-setting comparison for "LHS CC RHS".
	///
	/// Before emitting, several rewrites are attempted:
	///  - a constant RHS that is not a legal arithmetic immediate is moved by one
	///    (adjusting CC to match) if that makes it legal;
	///  - an equality test of a zero-extending i16 load against a constant whose
	///    low 16 bits are negative as int16_t is turned into a comparison of the
	///    sign-extended value, so that CMN can be used;
	///  - an equality test of an AND/OR/SETCC tree against 0 or 1 is lowered to a
	///    conditional compare chain.
	///
	/// \param AArch64cc [out] receives an MVT_CC constant holding the AArch64
	///                  condition code to test on the returned node.
	/// \returns the flags producing comparison node.
	static SDValue getAArch64Cmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
	                             SDValue &AArch64cc, SelectionDAG &DAG,
	                             const SDLoc &dl) {
	  if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS.getNode())) {
	    EVT VT = RHS.getValueType();
	    uint64_t C = RHSC->getZExtValue();
	    if (!isLegalArithImmed(C)) {
	      // Constant does not fit, try adjusting it by one?
	      // Each case excludes the value at which C -/+ 1 would wrap around for
	      // the signedness of the comparison.
	      switch (CC) {
	      default:
	        break;
	      case ISD::SETLT:
	      case ISD::SETGE:
	        // x < C  <=>  x <= C - 1, unless C is the minimum signed value.
	        if ((VT == MVT::i32 && C != 0x80000000 &&
	             isLegalArithImmed((uint32_t)(C - 1))) ||
	            (VT == MVT::i64 && C != 0x8000000000000000ULL &&
	             isLegalArithImmed(C - 1ULL))) {
	          CC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGT;
	          C = (VT == MVT::i32) ? (uint32_t)(C - 1) : C - 1;
	          RHS = DAG.getConstant(C, dl, VT);
	        }
	        break;
	      case ISD::SETULT:
	      case ISD::SETUGE:
	        // x <u C  <=>  x <=u C - 1, unless C is zero.
	        if ((VT == MVT::i32 && C != 0 &&
	             isLegalArithImmed((uint32_t)(C - 1))) ||
	            (VT == MVT::i64 && C != 0ULL && isLegalArithImmed(C - 1ULL))) {
	          CC = (CC == ISD::SETULT) ? ISD::SETULE : ISD::SETUGT;
	          C = (VT == MVT::i32) ? (uint32_t)(C - 1) : C - 1;
	          RHS = DAG.getConstant(C, dl, VT);
	        }
	        break;
	      case ISD::SETLE:
	      case ISD::SETGT:
	        // x <= C  <=>  x < C + 1, unless C is the maximum signed value.
	        if ((VT == MVT::i32 && C != INT32_MAX &&
	             isLegalArithImmed((uint32_t)(C + 1))) ||
	            (VT == MVT::i64 && C != INT64_MAX &&
	             isLegalArithImmed(C + 1ULL))) {
	          CC = (CC == ISD::SETLE) ? ISD::SETLT : ISD::SETGE;
	          C = (VT == MVT::i32) ? (uint32_t)(C + 1) : C + 1;
	          RHS = DAG.getConstant(C, dl, VT);
	        }
	        break;
	      case ISD::SETULE:
	      case ISD::SETUGT:
	        // x <=u C  <=>  x <u C + 1, unless C is the maximum unsigned value.
	        if ((VT == MVT::i32 && C != UINT32_MAX &&
	             isLegalArithImmed((uint32_t)(C + 1))) ||
	            (VT == MVT::i64 && C != UINT64_MAX &&
	             isLegalArithImmed(C + 1ULL))) {
	          CC = (CC == ISD::SETULE) ? ISD::SETULT : ISD::SETUGE;
	          C = (VT == MVT::i32) ? (uint32_t)(C + 1) : C + 1;
	          RHS = DAG.getConstant(C, dl, VT);
	        }
	        break;
	      }
	    }
	  }
	  SDValue Cmp;
	  AArch64CC::CondCode AArch64CC;
	  if ((CC == ISD::SETEQ || CC == ISD::SETNE) && isa<ConstantSDNode>(RHS)) {
	    const ConstantSDNode *RHSC = cast<ConstantSDNode>(RHS);

	    // The imm operand of ADDS is an unsigned immediate, in the range 0 to 4095.
	    // For the i8 operand, the largest immediate is 255, so this can be easily
	    // encoded in the compare instruction. For the i16 operand, however, the
	    // largest immediate cannot be encoded in the compare.
	    // Therefore, use a sign extending load and cmn to avoid materializing the
	    // -1 constant. For example,
	    // movz w1, #65535
	    // ldrh w0, [x0, #0]
	    // cmp w0, w1
	    // >
	    // ldrsh w0, [x0, #0]
	    // cmn w0, #1
	    // Fundamentally, we're relying on the property that (zext LHS) == (zext
	    // RHS) if and only if (sext LHS) == (sext RHS). The checks are in place to
	    // ensure both the LHS and RHS are truly zero extended and to make sure the
	    // transformation is profitable.
	    if ((RHSC->getZExtValue() >> 16 == 0) && isa<LoadSDNode>(LHS) &&
	        cast<LoadSDNode>(LHS)->getExtensionType() == ISD::ZEXTLOAD &&
	        cast<LoadSDNode>(LHS)->getMemoryVT() == MVT::i16 &&
	        LHS.getNode()->hasNUsesOfValue(1, 0)) {
	      // Reinterpret the low 16 bits of the constant as a signed value.
	      int16_t ValueofRHS = cast<ConstantSDNode>(RHS)->getZExtValue();
	      if (ValueofRHS < 0 && isLegalArithImmed(-ValueofRHS)) {
	        SDValue SExt =
	            DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, LHS.getValueType(), LHS,
	                        DAG.getValueType(MVT::i16));
	        Cmp = emitComparison(SExt, DAG.getConstant(ValueofRHS, dl,
	                                                   RHS.getValueType()),
	                             CC, dl, DAG);
	        AArch64CC = changeIntCCToAArch64CC(CC);
	      }
	    }

	    if (!Cmp && (RHSC->isNullValue() || RHSC->isOne())) {
	      if ((Cmp = emitConjunctionDisjunctionTree(DAG, LHS, AArch64CC))) {
	        // The tree yields the condition for "LHS is true". Flip it when the
	        // comparison actually asks for "LHS is false" (== 0 or != 1).
	        if ((CC == ISD::SETNE) ^ RHSC->isNullValue())
	          AArch64CC = AArch64CC::getInvertedCondCode(AArch64CC);
	      }
	    }
	  }

	  // None of the special cases applied: emit a plain comparison.
	  if (!Cmp) {
	    Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
	    AArch64CC = changeIntCCToAArch64CC(CC);
	  }
	  AArch64cc = DAG.getConstant(AArch64CC, dl, MVT_CC);
	  return Cmp;
	}

	/// Build the nodes for an arithmetic-with-overflow operation (SADDO, UADDO,
	/// SSUBO, USUBO, SMULO, UMULO) on i32 or i64.
	///
	/// \param CC [out] the AArch64 condition code that signals overflow when
	///           tested on the returned flags value.
	/// \param Op the overflow node being lowered.
	/// \returns a pair (Value, Overflow): the arithmetic result and the flags
	///          producing value to test with \p CC.
	static std::pair<SDValue, SDValue>
	getAArch64XALUOOp(AArch64CC::CondCode &CC, SDValue Op, SelectionDAG &DAG) {
	  assert((Op.getValueType() == MVT::i32 || Op.getValueType() == MVT::i64) &&
	         "Unsupported value type");
	  SDValue Value, Overflow;
	  SDLoc DL(Op);
	  SDValue LHS = Op.getOperand(0);
	  SDValue RHS = Op.getOperand(1);
	  // Stays 0 for the multiply cases, which build Value/Overflow themselves.
	  unsigned Opc = 0;
	  switch (Op.getOpcode()) {
	  default:
	    llvm_unreachable("Unknown overflow instruction!");
	  case ISD::SADDO:
	    Opc = AArch64ISD::ADDS;
	    CC = AArch64CC::VS;
	    break;
	  case ISD::UADDO:
	    Opc = AArch64ISD::ADDS;
	    CC = AArch64CC::HS;
	    break;
	  case ISD::SSUBO:
	    Opc = AArch64ISD::SUBS;
	    CC = AArch64CC::VS;
	    break;
	  case ISD::USUBO:
	    Opc = AArch64ISD::SUBS;
	    CC = AArch64CC::LO;
	    break;
	  // Multiply needs a little bit extra work.
	  case ISD::SMULO:
	  case ISD::UMULO: {
	    CC = AArch64CC::NE;
	    bool IsSigned = Op.getOpcode() == ISD::SMULO;
	    if (Op.getValueType() == MVT::i32) {
	      unsigned ExtendOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
	      // For a 32 bit multiply with overflow check we want the instruction
	      // selector to generate a widening multiply (SMADDL/UMADDL). For that we
	      // need to generate the following pattern:
	      // (i64 add 0, (i64 mul (i64 sext|zext i32 %a), (i64 sext|zext i32 %b))
	      LHS = DAG.getNode(ExtendOpc, DL, MVT::i64, LHS);
	      RHS = DAG.getNode(ExtendOpc, DL, MVT::i64, RHS);
	      SDValue Mul = DAG.getNode(ISD::MUL, DL, MVT::i64, LHS, RHS);
	      SDValue Add = DAG.getNode(ISD::ADD, DL, MVT::i64, Mul,
	                                DAG.getConstant(0, DL, MVT::i64));
	      // On AArch64 the upper 32 bits are always zero extended for a 32 bit
	      // operation. We need to clear out the upper 32 bits, because we used a
	      // widening multiply that wrote all 64 bits. In the end this should be a
	      // noop.
	      Value = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Add);
	      if (IsSigned) {
	        // The signed overflow check requires more than just a simple check for
	        // any bit set in the upper 32 bits of the result. These bits could be
	        // just the sign bits of a negative number. To perform the overflow
	        // check we have to arithmetic shift right the 32nd bit of the result by
	        // 31 bits. Then we compare the result to the upper 32 bits.
	        SDValue UpperBits = DAG.getNode(ISD::SRL, DL, MVT::i64, Add,
	                                        DAG.getConstant(32, DL, MVT::i64));
	        UpperBits = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, UpperBits);
	        SDValue LowerBits = DAG.getNode(ISD::SRA, DL, MVT::i32, Value,
	                                        DAG.getConstant(31, DL, MVT::i64));
	        // It is important that LowerBits is last, otherwise the arithmetic
	        // shift will not be folded into the compare (SUBS).
	        SDVTList VTs = DAG.getVTList(MVT::i32, MVT::i32);
	        Overflow = DAG.getNode(AArch64ISD::SUBS, DL, VTs, UpperBits, LowerBits)
	                       .getValue(1);
	      } else {
	        // The overflow check for unsigned multiply is easy. We only need to
	        // check if any of the upper 32 bits are set. This can be done with a
	        // CMP (shifted register). For that we need to generate the following
	        // pattern:
	        // (i64 AArch64ISD::SUBS i64 0, (i64 srl i64 %Mul, i64 32)
	        SDValue UpperBits = DAG.getNode(ISD::SRL, DL, MVT::i64, Mul,
	                                        DAG.getConstant(32, DL, MVT::i64));
	        SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32);
	        Overflow =
	            DAG.getNode(AArch64ISD::SUBS, DL, VTs,
	                        DAG.getConstant(0, DL, MVT::i64),
	                        UpperBits).getValue(1);
	      }
	      break;
	    }
	    assert(Op.getValueType() == MVT::i64 && "Expected an i64 value type");
	    // For the 64 bit multiply the upper half of the product comes from
	    // MULHS/MULHU.
	    Value = DAG.getNode(ISD::MUL, DL, MVT::i64, LHS, RHS);
	    if (IsSigned) {
	      // No signed overflow iff the upper half equals the sign of the lower
	      // half replicated into all bits.
	      SDValue UpperBits = DAG.getNode(ISD::MULHS, DL, MVT::i64, LHS, RHS);
	      SDValue LowerBits = DAG.getNode(ISD::SRA, DL, MVT::i64, Value,
	                                      DAG.getConstant(63, DL, MVT::i64));
	      // It is important that LowerBits is last, otherwise the arithmetic
	      // shift will not be folded into the compare (SUBS).
	      SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32);
	      Overflow = DAG.getNode(AArch64ISD::SUBS, DL, VTs, UpperBits, LowerBits)
	                     .getValue(1);
	    } else {
	      // No unsigned overflow iff the upper half is zero.
	      SDValue UpperBits = DAG.getNode(ISD::MULHU, DL, MVT::i64, LHS, RHS);
	      SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32);
	      Overflow =
	          DAG.getNode(AArch64ISD::SUBS, DL, VTs,
	                      DAG.getConstant(0, DL, MVT::i64),
	                      UpperBits).getValue(1);
	    }
	    break;
	  }
	  } // switch (...)

	  if (Opc) {
	    SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::i32);

	    // Emit the AArch64 operation with overflow check.
	    Value = DAG.getNode(Opc, DL, VTs, LHS, RHS);
	    Overflow = Value.getValue(1);
	  }
	  return std::make_pair(Value, Overflow);
	}

	/// Lower \p Op to a call of the runtime library routine \p Call, passing all
	/// operands of \p Op as arguments and returning an f128 result.
	SDValue AArch64TargetLowering::LowerF128Call(SDValue Op, SelectionDAG &DAG,
	                                             RTLIB::Libcall Call) const {
	  const SDLoc DL(Op);
	  SmallVector<SDValue, 2> CallArgs(Op->op_begin(), Op->op_end());
	  std::pair<SDValue, SDValue> CallResult =
	      makeLibCall(DAG, Call, MVT::f128, CallArgs, /*isSigned=*/false, DL);
	  // Only the first member of the pair is handed back to the caller.
	  return CallResult.first;
	}

	/// Custom lowering for XOR: fold an XOR with a SELECT_CC that chooses between
	/// 0 and -1 into a CSEL between the other operand and its bitwise negation.
	/// Returns \p Op unchanged whenever the pattern does not match.
	static SDValue LowerXOR(SDValue Op, SelectionDAG &DAG) {
	  SDValue Sel = Op.getOperand(0);
	  SDValue Other = Op.getOperand(1);

	  // Bring the SELECT_CC (if any) into Sel. If neither operand is a
	  // SELECT_CC, give up.
	  if (Sel.getOpcode() != ISD::SELECT_CC)
	    std::swap(Sel, Other);
	  if (Sel.getOpcode() != ISD::SELECT_CC)
	    return Op;

	  // The folding we want to perform is:
	  // (xor x, (select_cc a, b, cc, 0, -1) )
	  // -->
	  // (csel x, (xor x, -1), cc ...)
	  //
	  // The latter will get matched to a CSINV instruction.

	  ISD::CondCode CC = cast<CondCodeSDNode>(Sel.getOperand(4))->get();
	  SDValue LHS = Sel.getOperand(0);
	  SDValue RHS = Sel.getOperand(1);
	  SDValue TVal = Sel.getOperand(2);
	  SDValue FVal = Sel.getOperand(3);
	  SDLoc dl(Sel);

	  // FIXME: This could be generalized to non-integer comparisons.
	  if (LHS.getValueType() != MVT::i32 && LHS.getValueType() != MVT::i64)
	    return Op;

	  ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(FVal);
	  ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(TVal);

	  // The values aren't constants, this isn't the pattern we're looking for.
	  if (!CFVal || !CTVal)
	    return Op;

	  // We can commute the SELECT_CC by inverting the condition. This
	  // might be needed to make this fit into a CSINV pattern.
	  if (CTVal->isAllOnesValue() && CFVal->isNullValue()) {
	    std::swap(TVal, FVal);
	    std::swap(CTVal, CFVal);
	    CC = ISD::getSetCCInverse(CC, true);
	  }

	  // If the constants line up, perform the transform!
	  if (CTVal->isNullValue() && CFVal->isAllOnesValue()) {
	    SDValue CCVal;
	    SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl);

	    // First CSEL operand: x itself (x ^ 0); second: x ^ -1.
	    FVal = Other;
	    TVal = DAG.getNode(ISD::XOR, dl, Other.getValueType(), Other,
	                       DAG.getConstant(-1ULL, dl, Other.getValueType()));

	    return DAG.getNode(AArch64ISD::CSEL, dl, Sel.getValueType(), FVal, TVal,
	                       CCVal, Cmp);
	  }

	  return Op;
	}

	/// Map ADDC/SUBC/ADDE/SUBE onto the flag-setting ADDS/SUBS/ADCS/SBCS nodes.
	/// Returns an empty SDValue when the result type is not legal yet, so that
	/// the legalizer expands the node instead.
	static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) {
	  const EVT ResultVT = Op.getValueType();
	  if (!DAG.getTargetLoweringInfo().isTypeLegal(ResultVT))
	    return SDValue();

	  unsigned TargetOpc = 0;
	  // ADDE/SUBE carry a third operand that has to be forwarded.
	  bool HasThirdOperand = false;
	  switch (Op.getOpcode()) {
	  case ISD::ADDC:
	    TargetOpc = AArch64ISD::ADDS;
	    break;
	  case ISD::SUBC:
	    TargetOpc = AArch64ISD::SUBS;
	    break;
	  case ISD::ADDE:
	    TargetOpc = AArch64ISD::ADCS;
	    HasThirdOperand = true;
	    break;
	  case ISD::SUBE:
	    TargetOpc = AArch64ISD::SBCS;
	    HasThirdOperand = true;
	    break;
	  default:
	    llvm_unreachable("Invalid code");
	  }

	  // Result value plus an i32 second result.
	  SDVTList ResultTys = DAG.getVTList(ResultVT, MVT::i32);
	  if (HasThirdOperand)
	    return DAG.getNode(TargetOpc, SDLoc(Op), ResultTys, Op.getOperand(0),
	                       Op.getOperand(1), Op.getOperand(2));
	  return DAG.getNode(TargetOpc, SDLoc(Op), ResultTys, Op.getOperand(0),
	                     Op.getOperand(1));
	}

	/// Lower an arithmetic-with-overflow node to the flag-setting operation from
	/// getAArch64XALUOOp() plus a CSEL that materializes the overflow bit as 0/1.
	/// Returns an empty SDValue for types that are not legal yet.
	static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
	  const EVT ResultVT = Op.getValueType();
	  // Let legalize expand this if it isn't a legal type yet.
	  if (!DAG.getTargetLoweringInfo().isTypeLegal(ResultVT))
	    return SDValue();

	  SDLoc dl(Op);
	  AArch64CC::CondCode OverflowCC;
	  // The actual operation that sets the overflow or carry flag.
	  std::pair<SDValue, SDValue> Lowered = getAArch64XALUOOp(OverflowCC, Op, DAG);
	  SDValue Result = Lowered.first;
	  SDValue Flags = Lowered.second;

	  // 1 and 0 stand for "overflow" and "no overflow".
	  SDValue One = DAG.getConstant(1, dl, MVT::i32);
	  SDValue Zero = DAG.getConstant(0, dl, MVT::i32);

	  // The select operands are swapped and the condition is inverted to match.
	  // This will allow it to be selected to a single instruction:
	  // CSINC Wd, WZR, WZR, invert(cond).
	  SDValue InvCC =
	      DAG.getConstant(getInvertedCondCode(OverflowCC), dl, MVT::i32);
	  SDValue OverflowBit =
	      DAG.getNode(AArch64ISD::CSEL, dl, MVT::i32, Zero, One, InvCC, Flags);

	  SDVTList ResultTys = DAG.getVTList(ResultVT, MVT::i32);
	  return DAG.getNode(ISD::MERGE_VALUES, dl, ResultTys, Result, OverflowBit);
	}

	// Prefetch operands are:
	// 1: Address to prefetch
	// 2: bool isWrite
	// 3: int locality (0 = no locality ... 3 = extreme locality)
	// 4: bool isDataCache
	//
	// The three constant operands are packed into a single i32 immediate that is
	// passed to the AArch64ISD::PREFETCH node.
	static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG) {
	  SDLoc DL(Op);
	  unsigned IsWrite = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
	  unsigned Locality = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
	  unsigned IsData = cast<ConstantSDNode>(Op.getOperand(4))->getZExtValue();

	  // Locality 0 selects the streaming variant.
	  bool IsStream = !Locality;
	  // When the locality number is set
	  if (Locality) {
	    // The front-end should have filtered out the out-of-range values
	    assert(Locality <= 3 && "Prefetch locality out-of-range");
	    // The locality degree is the opposite of the cache speed.
	    // Put the number the other way around.
	    // The encoding starts at 0 for level 1
	    Locality = 3 - Locality;
	  }

	  // Build the mask value encoding the expected behavior.
	  unsigned PrfOp = (IsWrite << 4) |     // Load/Store bit
	                   (!IsData << 3) |     // IsDataCache bit
	                   (Locality << 1) |    // Cache level bits
	                   (unsigned)IsStream;  // Stream bit
	  return DAG.getNode(AArch64ISD::PREFETCH, DL, MVT::Other, Op.getOperand(0),
	                     DAG.getConstant(PrfOp, DL, MVT::i32), Op.getOperand(1));
	}

	/// Lower an FP_EXTEND producing f128 to the matching runtime library call.
	/// Any other result type is unexpected here (asserted).
	SDValue AArch64TargetLowering::LowerFP_EXTEND(SDValue Op,
	                                              SelectionDAG &DAG) const {
	  const EVT DstVT = Op.getValueType();
	  assert(DstVT == MVT::f128 && "Unexpected lowering");

	  const EVT SrcVT = Op.getOperand(0).getValueType();
	  const RTLIB::Libcall ExtCall = RTLIB::getFPEXT(SrcVT, DstVT);
	  return LowerF128Call(Op, DAG, ExtCall);
	}

	/// Lower FP_ROUND. Rounding from anything but f128 is left untouched; an f128
	/// source is turned into a runtime library call.
	SDValue AArch64TargetLowering::LowerFP_ROUND(SDValue Op,
	                                             SelectionDAG &DAG) const {
	  if (Op.getOperand(0).getValueType() != MVT::f128) {
	    // It's legal except when f128 is involved
	    return Op;
	  }

	  RTLIB::Libcall LC;
	  LC = RTLIB::getFPROUND(Op.getOperand(0).getValueType(), Op.getValueType());

	  // FP_ROUND node has a second operand indicating whether it is known to be
	  // precise. That doesn't take part in the LibCall so we can't directly use
	  // LowerF128Call.
	  SDValue SrcVal = Op.getOperand(0);
	  return makeLibCall(DAG, LC, Op.getValueType(), SrcVal, /*isSigned*/ false,
	                     SDLoc(Op)).first;
	}

	/// Lower a vector FP_TO_SINT/FP_TO_UINT whose source is f16 or whose source
	/// and result differ in total bit width. All other nodes are returned as is.
	static SDValue LowerVectorFP_TO_INT(SDValue Op, SelectionDAG &DAG) {
	  // Warning: We maintain cost tables in AArch64TargetTransformInfo.cpp.
	  // Any additional optimization in this function should be recorded
	  // in the cost tables.
	  SDValue Src = Op.getOperand(0);
	  const EVT SrcVT = Src.getValueType();
	  const EVT DstVT = Op.getValueType();
	  const unsigned ConvOpc = Op.getOpcode();
	  const unsigned SrcElts = SrcVT.getVectorNumElements();
	  SDLoc dl(Op);

	  // f16 vectors are promoted to f32 before a conversion.
	  if (SrcVT.getVectorElementType() == MVT::f16) {
	    MVT PromotedVT = MVT::getVectorVT(MVT::f32, SrcElts);
	    SDValue Promoted = DAG.getNode(ISD::FP_EXTEND, dl, PromotedVT, Src);
	    return DAG.getNode(ConvOpc, dl, DstVT, Promoted);
	  }

	  // Narrowing: convert at the source width, then truncate the integers.
	  if (DstVT.getSizeInBits() < SrcVT.getSizeInBits()) {
	    SDValue Converted = DAG.getNode(
	        ConvOpc, dl, SrcVT.changeVectorElementTypeToInteger(), Src);
	    return DAG.getNode(ISD::TRUNCATE, dl, DstVT, Converted);
	  }

	  // Widening: extend the floats to the result width, then convert.
	  if (DstVT.getSizeInBits() > SrcVT.getSizeInBits()) {
	    MVT WideFPVT =
	        MVT::getVectorVT(MVT::getFloatingPointVT(DstVT.getScalarSizeInBits()),
	                         DstVT.getVectorNumElements());
	    SDValue Widened = DAG.getNode(ISD::FP_EXTEND, dl, WideFPVT, Src);
	    return DAG.getNode(ConvOpc, dl, DstVT, Widened);
	  }

	  // Same total width on both sides: nothing to rewrite.
	  return Op;
	}

	/// Lower FP_TO_SINT/FP_TO_UINT. Vector sources go to LowerVectorFP_TO_INT,
	/// f16 sources are promoted to f32, f128 sources become a runtime library
	/// call, and every other source type is returned unchanged.
	SDValue AArch64TargetLowering::LowerFP_TO_INT(SDValue Op,
	                                              SelectionDAG &DAG) const {
	  SDValue Src = Op.getOperand(0);
	  const EVT SrcVT = Src.getValueType();
	  if (SrcVT.isVector())
	    return LowerVectorFP_TO_INT(Op, DAG);

	  const EVT DstVT = Op.getValueType();
	  const unsigned ConvOpc = Op.getOpcode();

	  // f16 conversions are promoted to f32.
	  if (SrcVT == MVT::f16) {
	    SDLoc dl(Op);
	    SDValue Promoted = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, Src);
	    return DAG.getNode(ConvOpc, dl, DstVT, Promoted);
	  }

	  // It's legal except when f128 is involved
	  if (SrcVT != MVT::f128)
	    return Op;

	  const RTLIB::Libcall LC = (ConvOpc == ISD::FP_TO_SINT)
	                                ? RTLIB::getFPTOSINT(SrcVT, DstVT)
	                                : RTLIB::getFPTOUINT(SrcVT, DstVT);

	  SmallVector<SDValue, 2> CallArgs(Op->op_begin(), Op->op_end());
	  return makeLibCall(DAG, LC, DstVT, CallArgs, false, SDLoc(Op)).first;
	}

	/// Lower a vector SINT_TO_FP/UINT_TO_FP whose source and result differ in
	/// total bit width. Same-width conversions are returned unchanged.
	static SDValue LowerVectorINT_TO_FP(SDValue Op, SelectionDAG &DAG) {
	  // Warning: We maintain cost tables in AArch64TargetTransformInfo.cpp.
	  // Any additional optimization in this function should be recorded
	  // in the cost tables.
	  SDLoc dl(Op);
	  const unsigned ConvOpc = Op.getOpcode();
	  const EVT DstVT = Op.getValueType();
	  SDValue Src = Op.getOperand(0);
	  const EVT SrcVT = Src.getValueType();

	  // Narrowing: convert at the source element width, then round down.
	  if (DstVT.getSizeInBits() < SrcVT.getSizeInBits()) {
	    MVT WideFPVT =
	        MVT::getVectorVT(MVT::getFloatingPointVT(SrcVT.getScalarSizeInBits()),
	                         SrcVT.getVectorNumElements());
	    SDValue Converted = DAG.getNode(ConvOpc, dl, WideFPVT, Src);
	    return DAG.getNode(ISD::FP_ROUND, dl, DstVT, Converted,
	                       DAG.getIntPtrConstant(0, dl));
	  }

	  // Widening: extend the integers (matching the signedness of the
	  // conversion) to the result element width, then convert.
	  if (DstVT.getSizeInBits() > SrcVT.getSizeInBits()) {
	    unsigned ExtOpc = ISD::ZERO_EXTEND;
	    if (ConvOpc == ISD::SINT_TO_FP)
	      ExtOpc = ISD::SIGN_EXTEND;
	    EVT WideIntVT = DstVT.changeVectorElementTypeToInteger();
	    SDValue Extended = DAG.getNode(ExtOpc, dl, WideIntVT, Src);
	    return DAG.getNode(ConvOpc, dl, DstVT, Extended);
	  }

	  return Op;
	}

	/// Lower SINT_TO_FP/UINT_TO_FP. Vector results go to LowerVectorINT_TO_FP,
	/// f16 results are computed in f32 and rounded, i128 sources yield an empty
	/// SDValue, f128 results become a runtime library call, and everything else
	/// is returned unchanged.
	SDValue AArch64TargetLowering::LowerINT_TO_FP(SDValue Op,
	                                              SelectionDAG &DAG) const {
	  const EVT DstVT = Op.getValueType();
	  if (DstVT.isVector())
	    return LowerVectorINT_TO_FP(Op, DAG);

	  const unsigned ConvOpc = Op.getOpcode();
	  SDValue Src = Op.getOperand(0);

	  // f16 conversions are promoted to f32.
	  if (DstVT == MVT::f16) {
	    SDLoc dl(Op);
	    return DAG.getNode(ISD::FP_ROUND, dl, MVT::f16,
	                       DAG.getNode(ConvOpc, dl, MVT::f32, Src),
	                       DAG.getIntPtrConstant(0, dl));
	  }

	  const EVT SrcVT = Src.getValueType();

	  // i128 conversions are libcalls.
	  if (SrcVT == MVT::i128)
	    return SDValue();

	  // Other conversions are legal, unless it's to the completely software-based
	  // fp128.
	  if (DstVT != MVT::f128)
	    return Op;

	  const RTLIB::Libcall LC = (ConvOpc == ISD::SINT_TO_FP)
	                                ? RTLIB::getSINTTOFP(SrcVT, DstVT)
	                                : RTLIB::getUINTTOFP(SrcVT, DstVT);
	  return LowerF128Call(Op, DAG, LC);
	}

	/// Lower FSINCOS to a call of __sincos_stret (f64 argument) or
	/// __sincosf_stret (any other argument type). The callee is declared as
	/// returning a struct of two values of the argument type.
	SDValue AArch64TargetLowering::LowerFSINCOS(SDValue Op,
	                                            SelectionDAG &DAG) const {
	  // For iOS, we want to call an alternative entry point: __sincos_stret,
	  // which returns the values in two S / D registers.
	  SDLoc dl(Op);
	  SDValue Arg = Op.getOperand(0);
	  EVT ArgVT = Arg.getValueType();
	  Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());

	  ArgListTy Args;
	  ArgListEntry Entry;

	  // Single argument, passed without sign or zero extension.
	  Entry.Node = Arg;
	  Entry.Ty = ArgTy;
	  Entry.isSExt = false;
	  Entry.isZExt = false;
	  Args.push_back(Entry);

	  const char *LibcallName =
	      (ArgVT == MVT::f64) ? "__sincos_stret" : "__sincosf_stret";
	  SDValue Callee =
	      DAG.getExternalSymbol(LibcallName, getPointerTy(DAG.getDataLayout()));

	  StructType *RetTy = StructType::get(ArgTy, ArgTy, nullptr);
	  TargetLowering::CallLoweringInfo CLI(DAG);
	  CLI.setDebugLoc(dl).setChain(DAG.getEntryNode())
	      .setCallee(CallingConv::Fast, RetTy, Callee, std::move(Args));

	  std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
	  return CallResult.first;
	}

	/// Custom lowering for a BITCAST that produces f16 (the source is asserted to
	/// be i16): widen to i32, bitcast to f32 and extract the hsub sub-register.
	/// Bitcasts to any other type yield an empty SDValue.
	static SDValue LowerBITCAST(SDValue Op, SelectionDAG &DAG) {
	  if (Op.getValueType() != MVT::f16)
	    return SDValue();

	  SDValue Src = Op.getOperand(0);
	  assert(Src.getValueType() == MVT::i16);
	  SDLoc DL(Op);

	  SDValue Wide = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Src);
	  SDValue AsF32 = DAG.getNode(ISD::BITCAST, DL, MVT::f32, Wide);
	  SDValue SubRegIdx = DAG.getTargetConstant(AArch64::hsub, DL, MVT::i32);
	  SDNode *Extract = DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, DL,
	                                       MVT::f16, AsF32, SubRegIdx);
	  return SDValue(Extract, 0);
	}

	/// Return the type \p OrigVT has to be extended to so that it is 64 bits
	/// wide. Types that are already 64 bits or larger are returned unchanged.
	/// Only v2i8, v2i16 and v4i8 are handled among the smaller types.
	static EVT getExtensionTo64Bits(const EVT &OrigVT) {
	  if (OrigVT.getSizeInBits() >= 64)
	    return OrigVT;

	  assert(OrigVT.isSimple() && "Expecting a simple value type");

	  const MVT::SimpleValueType SmallTy = OrigVT.getSimpleVT().SimpleTy;
	  // Two-element vectors widen to 2 x i32, four-element ones to 4 x i16.
	  if (SmallTy == MVT::v2i8 || SmallTy == MVT::v2i16)
	    return MVT::v2i32;
	  if (SmallTy == MVT::v4i8)
	    return MVT::v4i16;
	  llvm_unreachable("Unexpected Vector Type");
	}

	/// \p N had type \p OrigTy before it was extended to the 128-bit type
	/// \p ExtTy. If \p OrigTy is narrower than 64 bits, re-extend \p N with
	/// \p ExtOpcode to the 64-bit type chosen by getExtensionTo64Bits();
	/// otherwise return \p N unchanged.
	static SDValue addRequiredExtensionForVectorMULL(SDValue N, SelectionDAG &DAG,
	                                                 const EVT &OrigTy,
	                                                 const EVT &ExtTy,
	                                                 unsigned ExtOpcode) {
	  assert(ExtTy.is128BitVector() && "Unexpected extension size");

	  const bool NeedsWidening = OrigTy.getSizeInBits() < 64;
	  if (!NeedsWidening)
	    return N;

	  // Must extend size to at least 64 bits to be used as an operand for VMULL.
	  const EVT WidenedVT = getExtensionTo64Bits(OrigTy);
	  return DAG.getNode(ExtOpcode, SDLoc(N), WidenedVT, N);
	}

	/// Return true if \p N is a BUILD_VECTOR whose operands are all integer
	/// constants that fit into half of the element width, interpreted as signed
	/// or unsigned according to \p isSigned.
	static bool isExtendedBUILD_VECTOR(SDNode *N, SelectionDAG &DAG,
	                                   bool isSigned) {
	  if (N->getOpcode() != ISD::BUILD_VECTOR)
	    return false;

	  const unsigned HalfBits = N->getValueType(0).getScalarSizeInBits() / 2;
	  for (const SDValue &Elt : N->op_values()) {
	    // Any non-constant operand disqualifies the vector.
	    ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Elt);
	    if (!Cst)
	      return false;

	    const bool FitsInHalf = isSigned
	                                ? isIntN(HalfBits, Cst->getSExtValue())
	                                : isUIntN(HalfBits, Cst->getZExtValue());
	    if (!FitsInHalf)
	      return false;
	  }

	  return true;
	}

	/// Return the narrow operand to use for a vector MULL in place of \p N.
	/// For a SIGN_EXTEND/ZERO_EXTEND this is the extension's operand, re-extended
	/// to 64 bits if necessary. Otherwise \p N must be a BUILD_VECTOR of
	/// constants (asserted), which is rebuilt with elements of half the width.
	static SDValue skipExtensionForVectorMULL(SDNode *N, SelectionDAG &DAG) {
	  if (N->getOpcode() == ISD::SIGN_EXTEND || N->getOpcode() == ISD::ZERO_EXTEND)
	    return addRequiredExtensionForVectorMULL(N->getOperand(0), DAG,
	                                             N->getOperand(0)->getValueType(0),
	                                             N->getValueType(0),
	                                             N->getOpcode());

	  assert(N->getOpcode() == ISD::BUILD_VECTOR && "expected BUILD_VECTOR");
	  EVT VT = N->getValueType(0);
	  SDLoc dl(N);
	  unsigned EltSize = VT.getScalarSizeInBits() / 2;
	  unsigned NumElts = VT.getVectorNumElements();
	  MVT TruncVT = MVT::getIntegerVT(EltSize);
	  SmallVector<SDValue, 8> Ops;
	  for (unsigned i = 0; i != NumElts; ++i) {
	    ConstantSDNode *C = cast<ConstantSDNode>(N->getOperand(i));
	    const APInt &CInt = C->getAPIntValue();
	    // Element types smaller than 32 bits are not legal, so use i32 elements.
	    // The values are implicitly truncated so sext vs. zext doesn't matter.
	    Ops.push_back(DAG.getConstant(CInt.zextOrTrunc(32), dl, MVT::i32));
	  }
	  return DAG.getBuildVector(MVT::getVectorVT(TruncVT, NumElts), dl, Ops);
	}

	/// Return true if \p N is a SIGN_EXTEND node or a BUILD_VECTOR accepted by
	/// isExtendedBUILD_VECTOR() in signed mode.
	static bool isSignExtended(SDNode *N, SelectionDAG &DAG) {
	  return N->getOpcode() == ISD::SIGN_EXTEND ||
	         isExtendedBUILD_VECTOR(N, DAG, /*isSigned=*/true);
	}

	/// Return true if \p N is a ZERO_EXTEND node or a BUILD_VECTOR accepted by
	/// isExtendedBUILD_VECTOR() in unsigned mode.
	static bool isZeroExtended(SDNode *N, SelectionDAG &DAG) {
	  return N->getOpcode() == ISD::ZERO_EXTEND ||
	         isExtendedBUILD_VECTOR(N, DAG, /*isSigned=*/false);
	}

	/// Return true if \p N is an ADD or SUB whose two operands each have a single
	/// use and are both sign-extended according to isSignExtended().
	static bool isAddSubSExt(SDNode *N, SelectionDAG &DAG) {
	  const unsigned Opcode = N->getOpcode();
	  if (Opcode != ISD::ADD && Opcode != ISD::SUB)
	    return false;

	  SDNode *N0 = N->getOperand(0).getNode();
	  SDNode *N1 = N->getOperand(1).getNode();
	  return N0->hasOneUse() && N1->hasOneUse() &&
	         isSignExtended(N0, DAG) && isSignExtended(N1, DAG);
	}

	/// Return true if \p N is an ADD or SUB whose two operands each have a single
	/// use and are both zero-extended according to isZeroExtended().
	static bool isAddSubZExt(SDNode *N, SelectionDAG &DAG) {
	  const unsigned Opcode = N->getOpcode();
	  if (Opcode != ISD::ADD && Opcode != ISD::SUB)
	    return false;

	  SDNode *N0 = N->getOperand(0).getNode();
	  SDNode *N1 = N->getOperand(1).getNode();
	  return N0->hasOneUse() && N1->hasOneUse() &&
	         isZeroExtended(N0, DAG) && isZeroExtended(N1, DAG);
	}

	/// Custom-lowers a 128-bit integer vector ISD::MUL.
	///
	/// When both operands are recognised as sign- (or zero-) extended, the
	/// multiply becomes a single AArch64ISD::SMULL (or UMULL) on the operands
	/// returned by skipExtensionForVectorMULL(). When one operand is an extended
	/// value and the other is an add/sub of two extended values, the multiply is
	/// distributed over the add/sub so that two S/UMULL nodes are produced.
	///
	/// \returns the new node; \p Op unchanged when no pattern matches and the
	/// type is not v2i64; or an empty SDValue for an unmatched v2i64 multiply.
	static SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) {
	  // Multiplications are only custom-lowered for 128-bit vectors so that
	  // VMULL can be detected. Otherwise v2i64 multiplications are not legal.
	  EVT VT = Op.getValueType();
	  assert(VT.is128BitVector() && VT.isInteger() &&
	         "unexpected type for custom-lowering ISD::MUL");
	  SDNode *N0 = Op.getOperand(0).getNode();
	  SDNode *N1 = Op.getOperand(1).getNode();
	  unsigned NewOpc = 0; // 0 means "no S/UMULL pattern matched".
	  bool isMLA = false;  // Set when the (A +/- B) * C form was matched.
	  bool isN0SExt = isSignExtended(N0, DAG);
	  bool isN1SExt = isSignExtended(N1, DAG);
	  if (isN0SExt && isN1SExt)
	    NewOpc = AArch64ISD::SMULL;
	  else {
	    bool isN0ZExt = isZeroExtended(N0, DAG);
	    bool isN1ZExt = isZeroExtended(N1, DAG);
	    if (isN0ZExt && isN1ZExt)
	      NewOpc = AArch64ISD::UMULL;
	    else if (isN1SExt || isN1ZExt) {
	      // Look for (s/zext A + s/zext B) * (s/zext C). We want to turn these
	      // into (s/zext A * s/zext C) + (s/zext B * s/zext C)
	      if (isN1SExt && isAddSubSExt(N0, DAG)) {
	        NewOpc = AArch64ISD::SMULL;
	        isMLA = true;
	      } else if (isN1ZExt && isAddSubZExt(N0, DAG)) {
	        NewOpc = AArch64ISD::UMULL;
	        isMLA = true;
	      } else if (isN0ZExt && isAddSubZExt(N1, DAG)) {
	        // Canonicalise so that N0 is always the add/sub and N1 the
	        // extended multiplicand used by the code below.
	        std::swap(N0, N1);
	        NewOpc = AArch64ISD::UMULL;
	        isMLA = true;
	      }
	    }

	    if (!NewOpc) {
	      if (VT == MVT::v2i64)
	        // Fall through to expand this. It is not legal.
	        return SDValue();
	      else
	        // Other vector multiplications are legal.
	        return Op;
	    }
	  }

	  // Legalize to a S/UMULL instruction
	  SDLoc DL(Op);
	  SDValue Op0;
	  SDValue Op1 = skipExtensionForVectorMULL(N1, DAG);
	  if (!isMLA) {
	    Op0 = skipExtensionForVectorMULL(N0, DAG);
	    assert(Op0.getValueType().is64BitVector() &&
	           Op1.getValueType().is64BitVector() &&
	           "unexpected types for extended operands to VMULL");
	    return DAG.getNode(NewOpc, DL, VT, Op0, Op1);
	  }
	  // Optimizing (s/zext A +/- s/zext B) * C to
	  // (S/UMULL A, C) +/- (S/UMULL B, C) during isel lowering to take advantage
	  // of no-stall back to back s/umul + s/umla.
	  // This is true for CPUs with accumulate forwarding such as Cortex-A53/A57
	  SDValue N00 = skipExtensionForVectorMULL(N0->getOperand(0).getNode(), DAG);
	  SDValue N01 = skipExtensionForVectorMULL(N0->getOperand(1).getNode(), DAG);
	  EVT Op1VT = Op1.getValueType();
	  // The add/sub operands are bitcast to Op1's type so both S/UMULL inputs
	  // agree; the outer node reuses N0's own opcode (ADD or SUB).
	  return DAG.getNode(N0->getOpcode(), DL, VT,
	                     DAG.getNode(NewOpc, DL, VT,
	                                 DAG.getNode(ISD::BITCAST, DL, Op1VT, N00), Op1),
	                     DAG.getNode(NewOpc, DL, VT,
	                                 DAG.getNode(ISD::BITCAST, DL, Op1VT, N01), Op1));
	}

	/// Custom-lowers the chain-less intrinsics handled here: thread_pointer
	/// becomes AArch64ISD::THREAD_POINTER and the NEON smax/umax/smin/umin
	/// intrinsics become the matching generic ISD min/max node. Any other
	/// intrinsic yields an empty SDValue (no custom lowering).
	SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
	                                                       SelectionDAG &DAG) const {
	  // Operand 0 of the node carries the intrinsic ID as a constant.
	  const unsigned IntNo =
	      cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
	  SDLoc dl(Op);

	  if (IntNo == Intrinsic::thread_pointer) {
	    EVT PtrVT = getPointerTy(DAG.getDataLayout());
	    return DAG.getNode(AArch64ISD::THREAD_POINTER, dl, PtrVT);
	  }

	  // Pick the generic opcode for the two-operand min/max intrinsics.
	  unsigned MinMaxOpc;
	  switch (IntNo) {
	  case Intrinsic::aarch64_neon_smax:
	    MinMaxOpc = ISD::SMAX;
	    break;
	  case Intrinsic::aarch64_neon_umax:
	    MinMaxOpc = ISD::UMAX;
	    break;
	  case Intrinsic::aarch64_neon_smin:
	    MinMaxOpc = ISD::SMIN;
	    break;
	  case Intrinsic::aarch64_neon_umin:
	    MinMaxOpc = ISD::UMIN;
	    break;
	  default:
	    return SDValue(); // Don't custom lower most intrinsics.
	  }

	  return DAG.getNode(MinMaxOpc, dl, Op.getValueType(), Op.getOperand(1),
	                     Op.getOperand(2));
	}

	/// Dispatches \p Op to the Lower* routine registered for its opcode.
	/// Reaching an opcode without an entry below is a programming error and
	/// hits llvm_unreachable.
	SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
	                                              SelectionDAG &DAG) const {
	  const unsigned Opc = Op.getOpcode();
	  switch (Opc) {
	  // Addresses, control flow and selects.
	  case ISD::BITCAST:            return LowerBITCAST(Op, DAG);
	  case ISD::GlobalAddress:      return LowerGlobalAddress(Op, DAG);
	  case ISD::GlobalTLSAddress:   return LowerGlobalTLSAddress(Op, DAG);
	  case ISD::SETCC:              return LowerSETCC(Op, DAG);
	  case ISD::BR_CC:              return LowerBR_CC(Op, DAG);
	  case ISD::SELECT:             return LowerSELECT(Op, DAG);
	  case ISD::SELECT_CC:          return LowerSELECT_CC(Op, DAG);
	  case ISD::JumpTable:          return LowerJumpTable(Op, DAG);
	  case ISD::ConstantPool:       return LowerConstantPool(Op, DAG);
	  case ISD::BlockAddress:       return LowerBlockAddress(Op, DAG);

	  // Variadic argument support.
	  case ISD::VASTART:            return LowerVASTART(Op, DAG);
	  case ISD::VACOPY:             return LowerVACOPY(Op, DAG);
	  case ISD::VAARG:              return LowerVAARG(Op, DAG);

	  // Carry and overflow arithmetic.
	  case ISD::ADDC:
	  case ISD::ADDE:
	  case ISD::SUBC:
	  case ISD::SUBE:               return LowerADDC_ADDE_SUBC_SUBE(Op, DAG);
	  case ISD::SADDO:
	  case ISD::UADDO:
	  case ISD::SSUBO:
	  case ISD::USUBO:
	  case ISD::SMULO:
	  case ISD::UMULO:              return LowerXALUO(Op, DAG);

	  // Floating-point arithmetic routed through the f128 runtime calls.
	  case ISD::FADD:               return LowerF128Call(Op, DAG, RTLIB::ADD_F128);
	  case ISD::FSUB:               return LowerF128Call(Op, DAG, RTLIB::SUB_F128);
	  case ISD::FMUL:               return LowerF128Call(Op, DAG, RTLIB::MUL_F128);
	  case ISD::FDIV:               return LowerF128Call(Op, DAG, RTLIB::DIV_F128);
	  case ISD::FP_ROUND:           return LowerFP_ROUND(Op, DAG);
	  case ISD::FP_EXTEND:          return LowerFP_EXTEND(Op, DAG);

	  // Frame queries.
	  case ISD::FRAMEADDR:          return LowerFRAMEADDR(Op, DAG);
	  case ISD::RETURNADDR:         return LowerRETURNADDR(Op, DAG);

	  // Vector construction and element access.
	  case ISD::INSERT_VECTOR_ELT:  return LowerINSERT_VECTOR_ELT(Op, DAG);
	  case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
	  case ISD::BUILD_VECTOR:       return LowerBUILD_VECTOR(Op, DAG);
	  case ISD::VECTOR_SHUFFLE:     return LowerVECTOR_SHUFFLE(Op, DAG);
	  case ISD::EXTRACT_SUBVECTOR:  return LowerEXTRACT_SUBVECTOR(Op, DAG);

	  // Shifts.
	  case ISD::SRA:
	  case ISD::SRL:
	  case ISD::SHL:                return LowerVectorSRA_SRL_SHL(Op, DAG);
	  case ISD::SHL_PARTS:          return LowerShiftLeftParts(Op, DAG);
	  case ISD::SRL_PARTS:
	  case ISD::SRA_PARTS:          return LowerShiftRightParts(Op, DAG);

	  // Bit manipulation and logic.
	  case ISD::CTPOP:              return LowerCTPOP(Op, DAG);
	  case ISD::FCOPYSIGN:          return LowerFCOPYSIGN(Op, DAG);
	  case ISD::AND:                return LowerVectorAND(Op, DAG);
	  case ISD::OR:                 return LowerVectorOR(Op, DAG);
	  case ISD::XOR:                return LowerXOR(Op, DAG);

	  // Miscellaneous.
	  case ISD::PREFETCH:           return LowerPREFETCH(Op, DAG);
	  case ISD::SINT_TO_FP:
	  case ISD::UINT_TO_FP:         return LowerINT_TO_FP(Op, DAG);
	  case ISD::FP_TO_SINT:
	  case ISD::FP_TO_UINT:         return LowerFP_TO_INT(Op, DAG);
	  case ISD::FSINCOS:            return LowerFSINCOS(Op, DAG);
	  case ISD::MUL:                return LowerMUL(Op, DAG);
	  case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);

	  default:
	    llvm_unreachable("unimplemented operand");
	    return SDValue();
	  }
	}

	//===----------------------------------------------------------------------===//
	// Calling Convention Implementation
	//===----------------------------------------------------------------------===//

	#include "AArch64GenCallingConv.inc"

	/// Maps a calling convention onto the CCAssignFn used for its call operands.
	/// WebKit_JS and GHC have dedicated assigners. C, Fast, PreserveMost,
	/// CXX_FAST_TLS and Swift use AAPCS on non-Darwin targets and one of the two
	/// DarwinPCS assigners (chosen by \p IsVarArg) on Darwin. Any other
	/// convention is unsupported.
	CCAssignFn *AArch64TargetLowering::CCAssignFnForCall(CallingConv::ID CC,
	                                                     bool IsVarArg) const {
	  switch (CC) {
	  case CallingConv::WebKit_JS:
	    return CC_AArch64_WebKit_JS;
	  case CallingConv::GHC:
	    return CC_AArch64_GHC;
	  case CallingConv::C:
	  case CallingConv::Fast:
	  case CallingConv::PreserveMost:
	  case CallingConv::CXX_FAST_TLS:
	  case CallingConv::Swift:
	    // Handled after the switch: the choice depends on the target OS.
	    break;
	  default:
	    llvm_unreachable("Unsupported calling convention.");
	  }

	  if (Subtarget->isTargetDarwin()) {
	    if (IsVarArg)
	      return CC_AArch64_DarwinPCS_VarArg;
	    return CC_AArch64_DarwinPCS;
	  }
	  return CC_AArch64_AAPCS;
	}

	/// Returns the CCAssignFn used for return values: the WebKit_JS assigner for
	/// that convention, and the AAPCS return assigner for every other one.
	CCAssignFn *
	AArch64TargetLowering::CCAssignFnForReturn(CallingConv::ID CC) const {
	  if (CC == CallingConv::WebKit_JS)
	    return RetCC_AArch64_WebKit_JS;
	  return RetCC_AArch64_AAPCS;
	}

	/// Lowers the incoming formal arguments of the current function.
	///
	/// Each entry of \p Ins is assigned a location with the call-operand
	/// CCAssignFn for \p CallConv, then materialised as a frame index (byval), a
	/// copy from a live-in register, or a load from a fixed stack object. One
	/// SDValue per argument is appended to \p InVals. For variadic functions the
	/// unallocated argument registers are saved on non-Darwin targets and the
	/// vararg stack index is recorded. The size of the stack argument area is
	/// stored in the function info.
	///
	/// \returns the chain, updated by saveVarArgRegisters() when it is called.
	SDValue AArch64TargetLowering::LowerFormalArguments(
	    SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
	    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
	    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
	  MachineFunction &MF = DAG.getMachineFunction();
	  MachineFrameInfo &MFI = MF.getFrameInfo();

	  // Assign locations to all of the incoming arguments.
	  SmallVector<CCValAssign, 16> ArgLocs;
	  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
	                 *DAG.getContext());

	  // At this point, Ins[].VT may already be promoted to i32. To correctly
	  // handle passing i8 as i8 instead of i32 on stack, we pass in both i32 and
	  // i8 to CC_AArch64_AAPCS with i32 being ValVT and i8 being LocVT.
	  // Since AnalyzeFormalArguments uses Ins[].VT for both ValVT and LocVT, here
	  // we use a special version of AnalyzeFormalArguments to pass in ValVT and
	  // LocVT.
	  unsigned NumArgs = Ins.size();
	  Function::const_arg_iterator CurOrigArg = MF.getFunction()->arg_begin();
	  unsigned CurArgIdx = 0;
	  for (unsigned i = 0; i != NumArgs; ++i) {
	    MVT ValVT = Ins[i].VT;
	    if (Ins[i].isOrigArg()) {
	      // Move the IR argument iterator forward to the argument this input
	      // piece originates from.
	      std::advance(CurOrigArg, Ins[i].getOrigArgIndex() - CurArgIdx);
	      CurArgIdx = Ins[i].getOrigArgIndex();

	      // Get type of the original argument.
	      EVT ActualVT = getValueType(DAG.getDataLayout(), CurOrigArg->getType(),
	                                  /*AllowUnknown*/ true);
	      MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : MVT::Other;
	      // If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16.
	      if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8)
	        ValVT = MVT::i8;
	      else if (ActualMVT == MVT::i16)
	        ValVT = MVT::i16;
	    }
	    CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, /*IsVarArg=*/false);
	    bool Res =
	        AssignFn(i, ValVT, ValVT, CCValAssign::Full, Ins[i].Flags, CCInfo);
	    assert(!Res && "Call operand has unhandled type");
	    (void)Res;
	  }
	  assert(ArgLocs.size() == Ins.size());
	  SmallVector<SDValue, 16> ArgValues;
	  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
	    CCValAssign &VA = ArgLocs[i];

	    if (Ins[i].Flags.isByVal()) {
	      // Byval is used for HFAs in the PCS, but the system should work in a
	      // non-compliant manner for larger structs.
	      EVT PtrVT = getPointerTy(DAG.getDataLayout());
	      int Size = Ins[i].Flags.getByValSize();
	      // Round the byval size up to a whole number of 8-byte units.
	      unsigned NumRegs = (Size + 7) / 8;

	      // FIXME: This works on big-endian for composite byvals, which are the
	      // common case. It should also work for fundamental types too.
	      unsigned FrameIdx =
	          MFI.CreateFixedObject(8 * NumRegs, VA.getLocMemOffset(), false);
	      SDValue FrameIdxN = DAG.getFrameIndex(FrameIdx, PtrVT);
	      InVals.push_back(FrameIdxN);

	      continue;
	    }

	    if (VA.isRegLoc()) {
	      // Arguments stored in registers.
	      EVT RegVT = VA.getLocVT();

	      SDValue ArgValue;
	      const TargetRegisterClass *RC;

	      if (RegVT == MVT::i32)
	        RC = &AArch64::GPR32RegClass;
	      else if (RegVT == MVT::i64)
	        RC = &AArch64::GPR64RegClass;
	      else if (RegVT == MVT::f16)
	        RC = &AArch64::FPR16RegClass;
	      else if (RegVT == MVT::f32)
	        RC = &AArch64::FPR32RegClass;
	      else if (RegVT == MVT::f64 || RegVT.is64BitVector())
	        RC = &AArch64::FPR64RegClass;
	      else if (RegVT == MVT::f128 || RegVT.is128BitVector())
	        RC = &AArch64::FPR128RegClass;
	      else
	        llvm_unreachable("RegVT not supported by FORMAL_ARGUMENTS Lowering");

	      // Transform the arguments in physical registers into virtual ones.
	      unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
	      ArgValue = DAG.getCopyFromReg(Chain, DL, Reg, RegVT);

	      // If this is an 8, 16 or 32-bit value, it is really passed promoted
	      // to 64 bits. Insert an assert[sz]ext to capture this, then
	      // truncate to the right size.
	      switch (VA.getLocInfo()) {
	      default:
	        llvm_unreachable("Unknown loc info!");
	      case CCValAssign::Full:
	        break;
	      case CCValAssign::BCvt:
	        ArgValue = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), ArgValue);
	        break;
	      case CCValAssign::AExt:
	      case CCValAssign::SExt:
	      case CCValAssign::ZExt:
	        // SelectionDAGBuilder will insert appropriate AssertZExt & AssertSExt
	        // nodes after our lowering.
	        assert(RegVT == Ins[i].VT && "incorrect register location selected");
	        break;
	      }

	      InVals.push_back(ArgValue);

	    } else { // !VA.isRegLoc(): the argument lives on the stack.
	      assert(VA.isMemLoc() && "CCValAssign is neither reg nor mem");
	      unsigned ArgOffset = VA.getLocMemOffset();
	      unsigned ArgSize = VA.getValVT().getSizeInBits() / 8;

	      // On big-endian targets a value narrower than 8 bytes is addressed at
	      // an offset of (8 - size) within its slot, unless it is part of a
	      // consecutive-register block.
	      uint32_t BEAlign = 0;
	      if (!Subtarget->isLittleEndian() && ArgSize < 8 &&
	          !Ins[i].Flags.isInConsecutiveRegs())
	        BEAlign = 8 - ArgSize;

	      int FI = MFI.CreateFixedObject(ArgSize, ArgOffset + BEAlign, true);

	      // Create load nodes to retrieve arguments from the stack.
	      SDValue FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
	      SDValue ArgValue;

	      // For NON_EXTLOAD, generic code in getLoad assert(ValVT == MemVT)
	      ISD::LoadExtType ExtType = ISD::NON_EXTLOAD;
	      MVT MemVT = VA.getValVT();

	      switch (VA.getLocInfo()) {
	      default:
	        break;
	      case CCValAssign::BCvt:
	        MemVT = VA.getLocVT();
	        break;
	      case CCValAssign::SExt:
	        ExtType = ISD::SEXTLOAD;
	        break;
	      case CCValAssign::ZExt:
	        ExtType = ISD::ZEXTLOAD;
	        break;
	      case CCValAssign::AExt:
	        ExtType = ISD::EXTLOAD;
	        break;
	      }

	      ArgValue = DAG.getExtLoad(
	          ExtType, DL, VA.getLocVT(), Chain, FIN,
	          MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI),
	          MemVT);

	      InVals.push_back(ArgValue);
	    }
	  }

	  // varargs
	  AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
	  if (isVarArg) {
	    if (!Subtarget->isTargetDarwin()) {
	      // The AAPCS variadic function ABI is identical to the non-variadic
	      // one. As a result there may be more arguments in registers and we
	      // should save them for future reference.
	      saveVarArgRegisters(CCInfo, DAG, DL, Chain);
	    }

	    // This will point to the next argument passed via stack.
	    unsigned StackOffset = CCInfo.getNextStackOffset();
	    // We currently pass all varargs at 8-byte alignment.
	    StackOffset = ((StackOffset + 7) & ~7);
	    FuncInfo->setVarArgsStackIndex(MFI.CreateFixedObject(4, StackOffset, true));
	  }

	  unsigned StackArgSize = CCInfo.getNextStackOffset();
	  bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
	  if (DoesCalleeRestoreStack(CallConv, TailCallOpt)) {
	    // This is a non-standard ABI so by fiat I say we're allowed to make full
	    // use of the stack area to be popped, which must be aligned to 16 bytes in
	    // any case:
	    StackArgSize = alignTo(StackArgSize, 16);

	    // If we're expected to restore the stack (e.g. fastcc) then we'll be adding
	    // a multiple of 16.
	    FuncInfo->setArgumentStackToRestore(StackArgSize);

	    // This realignment carries over to the available bytes below. Our own
	    // callers will guarantee the space is free by giving an aligned value to
	    // CALLSEQ_START.
	  }
	  // Even if we're not expected to free up the space, it's useful to know how
	  // much is there while considering tail calls (because we can reuse it).
	  FuncInfo->setBytesInStackArgArea(StackArgSize);

	  return Chain;
	}

	/// Spills the argument registers that \p CCInfo reports as unallocated so
	/// they can be read back later. X0-X7 are always considered; Q0-Q7 only
	/// when the subtarget has FP/ARMv8 support. The frame index and byte size
	/// of each save area are recorded in the function info, and \p Chain is
	/// replaced by a TokenFactor over the emitted stores when there are any.
	void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo,
	                                                SelectionDAG &DAG,
	                                                const SDLoc &DL,
	                                                SDValue &Chain) const {
	  MachineFunction &MF = DAG.getMachineFunction();
	  MachineFrameInfo &MFI = MF.getFrameInfo();
	  AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
	  auto PtrVT = getPointerTy(DAG.getDataLayout());

	  // Chains of every register store emitted below.
	  SmallVector<SDValue, 8> SaveStores;

	  // --- General-purpose argument registers (8 bytes each) ---
	  static const MCPhysReg GPRArgRegs[] = { AArch64::X0, AArch64::X1, AArch64::X2,
	                                          AArch64::X3, AArch64::X4, AArch64::X5,
	                                          AArch64::X6, AArch64::X7 };
	  static const unsigned NumGPRArgRegs = array_lengthof(GPRArgRegs);
	  unsigned FirstVariadicGPR = CCInfo.getFirstUnallocated(GPRArgRegs);

	  unsigned GPRSaveSize = 8 * (NumGPRArgRegs - FirstVariadicGPR);
	  int GPRIdx = 0;
	  if (GPRSaveSize) {
	    GPRIdx = MFI.CreateStackObject(GPRSaveSize, 8, false);

	    // Address of the slot the next register is written to.
	    SDValue SlotAddr = DAG.getFrameIndex(GPRIdx, PtrVT);

	    for (unsigned RegNo = FirstVariadicGPR; RegNo < NumGPRArgRegs; ++RegNo) {
	      unsigned LiveInReg =
	          MF.addLiveIn(GPRArgRegs[RegNo], &AArch64::GPR64RegClass);
	      SDValue RegVal = DAG.getCopyFromReg(Chain, DL, LiveInReg, MVT::i64);
	      SDValue Store = DAG.getStore(
	          RegVal.getValue(1), DL, RegVal, SlotAddr,
	          MachinePointerInfo::getStack(DAG.getMachineFunction(), RegNo * 8));
	      SaveStores.push_back(Store);
	      // Step to the following 8-byte slot.
	      SlotAddr = DAG.getNode(ISD::ADD, DL, PtrVT, SlotAddr,
	                             DAG.getConstant(8, DL, PtrVT));
	    }
	  }
	  FuncInfo->setVarArgsGPRIndex(GPRIdx);
	  FuncInfo->setVarArgsGPRSize(GPRSaveSize);

	  // --- Floating-point/SIMD argument registers (16 bytes each) ---
	  if (Subtarget->hasFPARMv8()) {
	    static const MCPhysReg FPRArgRegs[] = {
	        AArch64::Q0, AArch64::Q1, AArch64::Q2, AArch64::Q3,
	        AArch64::Q4, AArch64::Q5, AArch64::Q6, AArch64::Q7};
	    static const unsigned NumFPRArgRegs = array_lengthof(FPRArgRegs);
	    unsigned FirstVariadicFPR = CCInfo.getFirstUnallocated(FPRArgRegs);

	    unsigned FPRSaveSize = 16 * (NumFPRArgRegs - FirstVariadicFPR);
	    int FPRIdx = 0;
	    if (FPRSaveSize) {
	      FPRIdx = MFI.CreateStackObject(FPRSaveSize, 16, false);

	      SDValue SlotAddr = DAG.getFrameIndex(FPRIdx, PtrVT);

	      for (unsigned RegNo = FirstVariadicFPR; RegNo < NumFPRArgRegs; ++RegNo) {
	        unsigned LiveInReg =
	            MF.addLiveIn(FPRArgRegs[RegNo], &AArch64::FPR128RegClass);
	        SDValue RegVal = DAG.getCopyFromReg(Chain, DL, LiveInReg, MVT::f128);

	        SDValue Store = DAG.getStore(
	            RegVal.getValue(1), DL, RegVal, SlotAddr,
	            MachinePointerInfo::getStack(DAG.getMachineFunction(), RegNo * 16));
	        SaveStores.push_back(Store);
	        // Step to the following 16-byte slot.
	        SlotAddr = DAG.getNode(ISD::ADD, DL, PtrVT, SlotAddr,
	                               DAG.getConstant(16, DL, PtrVT));
	      }
	    }
	    FuncInfo->setVarArgsFPRIndex(FPRIdx);
	    FuncInfo->setVarArgsFPRSize(FPRSaveSize);
	  }

	  // Merge all the store chains into the caller's chain.
	  if (!SaveStores.empty())
	    Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, SaveStores);
	}

	/// LowerCallResult - Copy the values returned by a call out of the physical
	/// registers assigned by the return calling convention and append them to
	/// \p InVals. When \p isThisReturn is set, the first result is taken directly
	/// from \p ThisVal instead of being copied from a register.
	///
	/// \returns the chain after the last register copy.
	SDValue AArch64TargetLowering::LowerCallResult(
	    SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg,
	    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
	    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool isThisReturn,
	    SDValue ThisVal) const {
	  CCAssignFn *RetCC = RetCC_AArch64_AAPCS;
	  if (CallConv == CallingConv::WebKit_JS)
	    RetCC = RetCC_AArch64_WebKit_JS;

	  // Assign locations to each value returned by this call.
	  SmallVector<CCValAssign, 16> RVLocs;
	  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
	                 *DAG.getContext());
	  CCInfo.AnalyzeCallResult(Ins, RetCC);

	  // Copy all of the result registers out of their specified physreg.
	  for (unsigned Idx = 0, NumLocs = RVLocs.size(); Idx != NumLocs; ++Idx) {
	    CCValAssign VA = RVLocs[Idx];

	    // Pass 'this' value directly from the argument to return value, to avoid
	    // reg unit interference
	    const bool ForwardThis = isThisReturn && Idx == 0;
	    if (ForwardThis) {
	      assert(!VA.needsCustom() && VA.getLocVT() == MVT::i64 &&
	             "unexpected return calling convention register assignment");
	      InVals.push_back(ThisVal);
	      continue;
	    }

	    // The copy yields the value, the new chain and the new glue, in that
	    // order.
	    SDValue Val =
	        DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), InFlag);
	    Chain = Val.getValue(1);
	    InFlag = Val.getValue(2);

	    // Only Full (no conversion) and BCvt (bitcast back to the value type)
	    // are expected for return values.
	    const auto Info = VA.getLocInfo();
	    if (Info == CCValAssign::BCvt)
	      Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val);
	    else if (Info != CCValAssign::Full)
	      llvm_unreachable("Unknown loc info!");

	    InVals.push_back(Val);
	  }

	  return Chain;
	}

	/// Reports whether tail-call optimization can be guaranteed for \p CC.
	/// Only CallingConv::Fast qualifies.
	static bool canGuaranteeTCO(CallingConv::ID CC) {
	  switch (CC) {
	  case CallingConv::Fast:
	    return true;
	  default:
	    return false;
	  }
	}

	/// Reports whether a call using \p CC could ever be tail-called: true for C,
	/// PreserveMost and Swift, and for any convention accepted by
	/// canGuaranteeTCO().
	static bool mayTailCallThisCC(CallingConv::ID CC) {
	  const bool IsSibcallCandidate = CC == CallingConv::C ||
	                                  CC == CallingConv::PreserveMost ||
	                                  CC == CallingConv::Swift;
	  return IsSibcallCandidate || canGuaranteeTCO(CC);
	}

	/// Decides whether a call can be emitted as a tail call.
	///
	/// The call is rejected when the callee convention is never tail-called,
	/// when the caller has a byval argument, when the callee is an external-weak
	/// global on the affected targets, when a variadic call would need a memory
	/// operand, when results or preserved registers are incompatible between
	/// caller and callee conventions, when the outgoing stack arguments do not
	/// fit in the caller's own stack argument area, or when
	/// parametersInCSRMatch() fails. With GuaranteedTailCallOpt enabled the
	/// answer is simply whether the convention guarantees TCO and matches the
	/// caller's.
	bool AArch64TargetLowering::isEligibleForTailCallOptimization(
	    SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg,
	    const SmallVectorImpl<ISD::OutputArg> &Outs,
	    const SmallVectorImpl<SDValue> &OutVals,
	    const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
	  if (!mayTailCallThisCC(CalleeCC))
	    return false;

	  MachineFunction &MF = DAG.getMachineFunction();
	  const Function *CallerF = MF.getFunction();
	  CallingConv::ID CallerCC = CallerF->getCallingConv();
	  bool CCMatch = CallerCC == CalleeCC;

	  // Byval parameters hand the function a pointer directly into the stack area
	  // we want to reuse during a tail call. Working around this is possible (see
	  // X86) but less efficient and uglier in LowerCall.
	  for (Function::const_arg_iterator i = CallerF->arg_begin(),
	                                    e = CallerF->arg_end();
	       i != e; ++i)
	    if (i->hasByValAttr())
	      return false;

	  if (getTargetMachine().Options.GuaranteedTailCallOpt)
	    return canGuaranteeTCO(CalleeCC) && CCMatch;

	  // Externally-defined functions with weak linkage should not be
	  // tail-called on AArch64 when the OS does not support dynamic
	  // pre-emption of symbols, as the AAELF spec requires normal calls
	  // to undefined weak functions to be replaced with a NOP or jump to the
	  // next instruction. The behaviour of branch instructions in this
	  // situation (as used for tail calls) is implementation-defined, so we
	  // cannot rely on the linker replacing the tail call with a return.
	  if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
	    const GlobalValue *GV = G->getGlobal();
	    const Triple &TT = getTargetMachine().getTargetTriple();
	    if (GV->hasExternalWeakLinkage() &&
	        (!TT.isOSWindows() || TT.isOSBinFormatELF() || TT.isOSBinFormatMachO()))
	      return false;
	  }

	  // Now we search for cases where we can use a tail call without changing the
	  // ABI. Sibcall is used in some places (particularly gcc) to refer to this
	  // concept.

	  // I want anyone implementing a new calling convention to think long and hard
	  // about this assert.
	  assert((!isVarArg || CalleeCC == CallingConv::C) &&
	         "Unexpected variadic calling convention");

	  LLVMContext &C = *DAG.getContext();
	  if (isVarArg && !Outs.empty()) {
	    // At least two cases here: if caller is fastcc then we can't have any
	    // memory arguments (we'd be expected to clean up the stack afterwards). If
	    // caller is C then we could potentially use its argument area.

	    // FIXME: for now we take the most conservative of these in both cases:
	    // disallow all variadic memory operands.
	    SmallVector<CCValAssign, 16> ArgLocs;
	    CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);

	    CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, true));
	    for (const CCValAssign &ArgLoc : ArgLocs)
	      if (!ArgLoc.isRegLoc())
	        return false;
	  }

	  // Check that the call results are passed in the same way.
	  if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins,
	                                  CCAssignFnForCall(CalleeCC, isVarArg),
	                                  CCAssignFnForCall(CallerCC, isVarArg)))
	    return false;
	  // The callee has to preserve all registers the caller needs to preserve.
	  const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
	  const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
	  if (!CCMatch) {
	    const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
	    if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
	      return false;
	  }

	  // Nothing more to check if the callee is taking no arguments
	  if (Outs.empty())
	    return true;

	  SmallVector<CCValAssign, 16> ArgLocs;
	  CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);

	  CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, isVarArg));

	  const AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();

	  // If the stack arguments for this call do not fit into our own save area then
	  // the call cannot be made tail.
	  if (CCInfo.getNextStackOffset() > FuncInfo->getBytesInStackArgArea())
	    return false;

	  const MachineRegisterInfo &MRI = MF.getRegInfo();
	  if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals))
	    return false;

	  return true;
	}

	/// Builds a TokenFactor that orders \p Chain after every load of an incoming
	/// stack argument whose bytes overlap the frame object \p ClobberedFI.
	///
	/// The loads considered are users of the DAG entry node whose base pointer
	/// is a frame index with a negative index. The result always contains
	/// \p Chain as its first operand.
	SDValue AArch64TargetLowering::addTokenForArgument(SDValue Chain,
	                                                   SelectionDAG &DAG,
	                                                   MachineFrameInfo &MFI,
	                                                   int ClobberedFI) const {
	  SmallVector<SDValue, 8> ArgChains;
	  // Inclusive byte range occupied by the object about to be overwritten.
	  int64_t FirstByte = MFI.getObjectOffset(ClobberedFI);
	  int64_t LastByte = FirstByte + MFI.getObjectSize(ClobberedFI) - 1;

	  // Include the original chain at the beginning of the list. When this is
	  // used by target LowerCall hooks, this helps legalize find the
	  // CALLSEQ_BEGIN node.
	  ArgChains.push_back(Chain);

	  // Add a chain value for each stack argument load that overlaps the
	  // clobbered object.
	  for (SDNode::use_iterator U = DAG.getEntryNode().getNode()->use_begin(),
	                            UE = DAG.getEntryNode().getNode()->use_end();
	       U != UE; ++U)
	    if (LoadSDNode *L = dyn_cast<LoadSDNode>(*U))
	      if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(L->getBasePtr()))
	        if (FI->getIndex() < 0) {
	          int64_t InFirstByte = MFI.getObjectOffset(FI->getIndex());
	          int64_t InLastByte = InFirstByte;
	          InLastByte += MFI.getObjectSize(FI->getIndex()) - 1;

	          // The two inclusive ranges overlap when either one contains the
	          // other's first byte.
	          if ((InFirstByte <= FirstByte && FirstByte <= InLastByte) ||
	              (FirstByte <= InFirstByte && InFirstByte <= LastByte))
	            ArgChains.push_back(SDValue(L, 1));
	        }

	  // Build a tokenfactor for all the chains.
	  return DAG.getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ArgChains);
	}

	/// Reports whether the callee pops its own stack arguments: only when
	/// \p TailCallOpt is enabled and the convention is CallingConv::Fast.
	bool AArch64TargetLowering::DoesCalleeRestoreStack(CallingConv::ID CallCC,
	                                                   bool TailCallOpt) const {
	  if (!TailCallOpt)
	    return false;
	  return CallCC == CallingConv::Fast;
	}

	/// LowerCall - Lower a call to a callseq_start + CALL + callseq_end chain,
	/// and add input and output parameter nodes.
	SDValue
	AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
	SmallVectorImpl<SDValue> &InVals) const {
	SelectionDAG &DAG = CLI.DAG;
	SDLoc &DL = CLI.DL;
	SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
	SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
	SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins;
	SDValue Chain = CLI.Chain;
	SDValue Callee = CLI.Callee;
	bool &IsTailCall = CLI.IsTailCall;
	CallingConv::ID CallConv = CLI.CallConv;
	bool IsVarArg = CLI.IsVarArg;

	MachineFunction &MF = DAG.getMachineFunction();
	bool IsThisReturn = false;

	AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
	bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
	bool IsSibCall = false;

	if (IsTailCall) {
	// Check if it's really possible to do a tail call.
	IsTailCall = isEligibleForTailCallOptimization(
	Callee, CallConv, IsVarArg, Outs, OutVals, Ins, DAG);
	if (!IsTailCall && CLI.CS && CLI.CS->isMustTailCall())
	report_fatal_error("failed to perform tail call elimination on a call "
	"site marked musttail");

	// A sibling call is one where we're under the usual C ABI and not planning
	// to change that but can still do a tail call:
	if (!TailCallOpt && IsTailCall)
	IsSibCall = true;

	if (IsTailCall)
	++NumTailCalls;
	}

	// Analyze operands of the call, assigning locations to each operand.
	SmallVector<CCValAssign, 16> ArgLocs;
	CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), ArgLocs,
	*DAG.getContext());

	if (IsVarArg) {
	// Handle fixed and variable vector arguments differently.
	// Variable vector arguments always go into memory.
	unsigned NumArgs = Outs.size();

	for (unsigned i = 0; i != NumArgs; ++i) {
	MVT ArgVT = Outs[i].VT;
	ISD::ArgFlagsTy ArgFlags = Outs[i].Flags;
	CCAssignFn *AssignFn = CCAssignFnForCall(CallConv,
	/IsVarArg=/ !Outs[i].IsFixed);
	bool Res = AssignFn(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, CCInfo);
	assert(!Res && "Call operand has unhandled type");
	(void)Res;
	}
	} else {
	// At this point, Outs[].VT may already be promoted to i32. To correctly
	// handle passing i8 as i8 instead of i32 on stack, we pass in both i32 and
	// i8 to CC_AArch64_AAPCS with i32 being ValVT and i8 being LocVT.
	// Since AnalyzeCallOperands uses Ins[].VT for both ValVT and LocVT, here
	// we use a special version of AnalyzeCallOperands to pass in ValVT and
	// LocVT.
	unsigned NumArgs = Outs.size();
	for (unsigned i = 0; i != NumArgs; ++i) {
	MVT ValVT = Outs[i].VT;
	// Get type of the original argument.
	EVT ActualVT = getValueType(DAG.getDataLayout(),
	CLI.getArgs()[Outs[i].OrigArgIndex].Ty,
	/AllowUnknown/ true);
	MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : ValVT;
	ISD::ArgFlagsTy ArgFlags = Outs[i].Flags;
	// If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16.
	if (ActualMVT == MVT::i1 \|\| ActualMVT == MVT::i8)
	ValVT = MVT::i8;
	else if (ActualMVT == MVT::i16)
	ValVT = MVT::i16;

	CCAssignFn AssignFn = CCAssignFnForCall(CallConv, /IsVarArg=*/false);
	bool Res = AssignFn(i, ValVT, ValVT, CCValAssign::Full, ArgFlags, CCInfo);
	assert(!Res && "Call operand has unhandled type");
	(void)Res;
	}
	}

	// Get a count of how many bytes are to be pushed on the stack.
	unsigned NumBytes = CCInfo.getNextStackOffset();

	if (IsSibCall) {
	// Since we're not changing the ABI to make this a tail call, the memory
	// operands are already available in the caller's incoming argument space.
	NumBytes = 0;
	}

	// FPDiff is the byte offset of the call's argument area from the callee's.
	// Stores to callee stack arguments will be placed in FixedStackSlots offset
	// by this amount for a tail call. In a sibling call it must be 0 because the
	// caller will deallocate the entire stack and the callee still expects its
	// arguments to begin at SP+0. Completely unused for non-tail calls.
	int FPDiff = 0;

	if (IsTailCall && !IsSibCall) {
	unsigned NumReusableBytes = FuncInfo->getBytesInStackArgArea();

	// Since callee will pop argument stack as a tail call, we must keep the
	// popped size 16-byte aligned.
	NumBytes = alignTo(NumBytes, 16);

	// FPDiff will be negative if this tail call requires more space than we
	// would automatically have in our incoming argument space. Positive if we
	// can actually shrink the stack.
	FPDiff = NumReusableBytes - NumBytes;

	// The stack pointer must be 16-byte aligned at all times it's used for a
	// memory operation, which in practice means at all times and in
	// particular across call boundaries. Therefore our own arguments started at
	// a 16-byte aligned SP and the delta applied for the tail call should
	// satisfy the same constraint.
	assert(FPDiff % 16 == 0 && "unaligned stack on tail call");
	}

	// Adjust the stack pointer for the new arguments...
	// These operations are automatically eliminated by the prolog/epilog pass
	if (!IsSibCall)
	Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, DL,
	true),
	DL);

	SDValue StackPtr = DAG.getCopyFromReg(Chain, DL, AArch64::SP,
	getPointerTy(DAG.getDataLayout()));

	SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
	SmallVector<SDValue, 8> MemOpChains;
	auto PtrVT = getPointerTy(DAG.getDataLayout());

	// Walk the register/memloc assignments, inserting copies/loads.
	for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size(); i != e;
	++i, ++realArgIdx) {
	CCValAssign &VA = ArgLocs[i];
	SDValue Arg = OutVals[realArgIdx];
	ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags;

	// Promote the value if needed.
	switch (VA.getLocInfo()) {
	default:
	llvm_unreachable("Unknown loc info!");
	case CCValAssign::Full:
	break;
	case CCValAssign::SExt:
	Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
	break;
	case CCValAssign::ZExt:
	Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
	break;
	case CCValAssign::AExt:
	if (Outs[realArgIdx].ArgVT == MVT::i1) {
	// AAPCS requires i1 to be zero-extended to 8-bits by the caller.
	Arg = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Arg);
	Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i8, Arg);
	}
	Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
	break;
	case CCValAssign::BCvt:
	Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
	break;
	case CCValAssign::FPExt:
	Arg = DAG.getNode(ISD::FP_EXTEND, DL, VA.getLocVT(), Arg);
	break;
	}

	if (VA.isRegLoc()) {
	if (realArgIdx == 0 && Flags.isReturned() && Outs[0].VT == MVT::i64) {
	assert(VA.getLocVT() == MVT::i64 &&
	"unexpected calling convention register assignment");
	assert(!Ins.empty() && Ins[0].VT == MVT::i64 &&
	"unexpected use of 'returned'");
	IsThisReturn = true;
	}
	RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
	} else {
	assert(VA.isMemLoc());

	SDValue DstAddr;
	MachinePointerInfo DstInfo;

	// FIXME: This works on big-endian for composite byvals, which are the
	// common case. It should also work for fundamental types too.
	uint32_t BEAlign = 0;
	unsigned OpSize = Flags.isByVal() ? Flags.getByValSize() * 8
	: VA.getValVT().getSizeInBits();
	OpSize = (OpSize + 7) / 8;
	if (!Subtarget->isLittleEndian() && !Flags.isByVal() &&
	!Flags.isInConsecutiveRegs()) {
	if (OpSize < 8)
	BEAlign = 8 - OpSize;
	}
	unsigned LocMemOffset = VA.getLocMemOffset();
	int32_t Offset = LocMemOffset + BEAlign;
	SDValue PtrOff = DAG.getIntPtrConstant(Offset, DL);
	PtrOff = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, PtrOff);

	if (IsTailCall) {
	Offset = Offset + FPDiff;
	int FI = MF.getFrameInfo().CreateFixedObject(OpSize, Offset, true);

	DstAddr = DAG.getFrameIndex(FI, PtrVT);
	DstInfo =
	MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI);

	// Make sure any stack arguments overlapping with where we're storing
	// are loaded before this eventual operation. Otherwise they'll be
	// clobbered.
	Chain = addTokenForArgument(Chain, DAG, MF.getFrameInfo(), FI);
	} else {
	SDValue PtrOff = DAG.getIntPtrConstant(Offset, DL);

	DstAddr = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, PtrOff);
	DstInfo = MachinePointerInfo::getStack(DAG.getMachineFunction(),
	LocMemOffset);
	}

	if (Outs[i].Flags.isByVal()) {
	SDValue SizeNode =
	DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i64);
	SDValue Cpy = DAG.getMemcpy(
	Chain, DL, DstAddr, Arg, SizeNode, Outs[i].Flags.getByValAlign(),
	/isVol = / false, /AlwaysInline = / false,
	/isTailCall = / false,
	DstInfo, MachinePointerInfo());

	MemOpChains.push_back(Cpy);
	} else {
	// Since we pass i1/i8/i16 as i1/i8/i16 on stack and Arg is already
	// promoted to a legal register type i32, we should truncate Arg back to
	// i1/i8/i16.
	if (VA.getValVT() == MVT::i1 \|\| VA.getValVT() == MVT::i8 \|\|
	VA.getValVT() == MVT::i16)
	Arg = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Arg);

	SDValue Store = DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo);
	MemOpChains.push_back(Store);
	}
	}
	}

	if (!MemOpChains.empty())
	Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);

	// Build a sequence of copy-to-reg nodes chained together with token chain
	// and flag operands which copy the outgoing args into the appropriate regs.
	SDValue InFlag;
	for (auto &RegToPass : RegsToPass) {
	Chain = DAG.getCopyToReg(Chain, DL, RegToPass.first,
	RegToPass.second, InFlag);
	InFlag = Chain.getValue(1);
	}

	// If the callee is a GlobalAddress/ExternalSymbol node (quite common, every
	// direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol
	// node so that legalize doesn't hack it.
	if (getTargetMachine().getCodeModel() == CodeModel::Large &&
	Subtarget->isTargetMachO()) {
	if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
	const GlobalValue *GV = G->getGlobal();
	bool InternalLinkage = GV->hasInternalLinkage();
	if (InternalLinkage)
	Callee = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, 0);
	else {
	Callee =
	DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_GOT);
	Callee = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, Callee);
	}
	} else if (ExternalSymbolSDNode *S =
	dyn_cast<ExternalSymbolSDNode>(Callee)) {
	const char *Sym = S->getSymbol();
	Callee = DAG.getTargetExternalSymbol(Sym, PtrVT, AArch64II::MO_GOT);
	Callee = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, Callee);
	}
	} else if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
	const GlobalValue *GV = G->getGlobal();
	Callee = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, 0);
	} else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
	const char *Sym = S->getSymbol();
	Callee = DAG.getTargetExternalSymbol(Sym, PtrVT, 0);
	}

	// We don't usually want to end the call-sequence here because we would tidy
	// the frame up after the call, however in the ABI-changing tail-call case
	// we've carefully laid out the parameters so that when sp is reset they'll be
	// in the correct location.
	if (IsTailCall && !IsSibCall) {
	Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, DL, true),
	DAG.getIntPtrConstant(0, DL, true), InFlag, DL);
	InFlag = Chain.getValue(1);
	}

	std::vector<SDValue> Ops;
	Ops.push_back(Chain);
	Ops.push_back(Callee);

	if (IsTailCall) {
	// Each tail call may have to adjust the stack by a different amount, so
	// this information must travel along with the operation for eventual
	// consumption by emitEpilogue.
	Ops.push_back(DAG.getTargetConstant(FPDiff, DL, MVT::i32));
	}

	// Add argument registers to the end of the list so that they are known live
	// into the call.
	for (auto &RegToPass : RegsToPass)
	Ops.push_back(DAG.getRegister(RegToPass.first,
	RegToPass.second.getValueType()));

	// Add a register mask operand representing the call-preserved registers.
	const uint32_t *Mask;
	const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
	if (IsThisReturn) {
	// For 'this' returns, use the X0-preserving mask if applicable
	Mask = TRI->getThisReturnPreservedMask(MF, CallConv);
	if (!Mask) {
	IsThisReturn = false;
	Mask = TRI->getCallPreservedMask(MF, CallConv);
	}
	} else
	Mask = TRI->getCallPreservedMask(MF, CallConv);

	assert(Mask && "Missing call preserved mask for calling convention");
	Ops.push_back(DAG.getRegisterMask(Mask));

	if (InFlag.getNode())
	Ops.push_back(InFlag);

	SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);

	// If we're doing a tail call, use a TC_RETURN here rather than an
	// actual call instruction.
	if (IsTailCall) {
	MF.getFrameInfo().setHasTailCall();
	return DAG.getNode(AArch64ISD::TC_RETURN, DL, NodeTys, Ops);
	}

	// Returns a chain and a flag for retval copy to use.
	Chain = DAG.getNode(AArch64ISD::CALL, DL, NodeTys, Ops);
	InFlag = Chain.getValue(1);

	uint64_t CalleePopBytes =
	DoesCalleeRestoreStack(CallConv, TailCallOpt) ? alignTo(NumBytes, 16) : 0;

	Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, DL, true),
	DAG.getIntPtrConstant(CalleePopBytes, DL, true),
	InFlag, DL);
	if (!Ins.empty())
	InFlag = Chain.getValue(1);

	// Handle result values, copying them out of physregs into vregs that we
	// return.
	return LowerCallResult(Chain, InFlag, CallConv, IsVarArg, Ins, DL, DAG,
	InVals, IsThisReturn,
	IsThisReturn ? OutVals[0] : SDValue());
	}

	/// Returns the verdict of CCState::CheckReturn for \p Outs, using the
	/// return-value assignment function selected by \p CallConv.
	bool AArch64TargetLowering::CanLowerReturn(
	    CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg,
	    const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const {
	  // WebKit JS has its own return convention; everything else uses AAPCS.
	  CCAssignFn *AssignFn = RetCC_AArch64_AAPCS;
	  if (CallConv == CallingConv::WebKit_JS)
	    AssignFn = RetCC_AArch64_WebKit_JS;

	  SmallVector<CCValAssign, 16> Locs;
	  CCState State(CallConv, isVarArg, MF, Locs, Context);
	  return State.CheckReturn(Outs, AssignFn);
	}

	/// Lower a function return. Each outgoing value is copied into the register
	/// assigned to it by the return convention, the callee-saved registers that
	/// are handled via copy are appended as operands, and the whole thing is
	/// wrapped in an AArch64ISD::RET_FLAG node.
	SDValue
	AArch64TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
	                                   bool isVarArg,
	                                   const SmallVectorImpl<ISD::OutputArg> &Outs,
	                                   const SmallVectorImpl<SDValue> &OutVals,
	                                   const SDLoc &DL, SelectionDAG &DAG) const {
	  MachineFunction &MF = DAG.getMachineFunction();

	  // WebKit JS has its own return convention; everything else uses AAPCS.
	  CCAssignFn *AssignFn = RetCC_AArch64_AAPCS;
	  if (CallConv == CallingConv::WebKit_JS)
	    AssignFn = RetCC_AArch64_WebKit_JS;

	  SmallVector<CCValAssign, 16> RVLocs;
	  CCState CCInfo(CallConv, isVarArg, MF, RVLocs, *DAG.getContext());
	  CCInfo.AnalyzeReturn(Outs, AssignFn);

	  // Copy the result values into the output registers. Operand 0 is a
	  // placeholder for the chain and is patched once all copies exist.
	  SDValue Glue;
	  SmallVector<SDValue, 4> RetOps(1, Chain);
	  const unsigned NumLocs = RVLocs.size();
	  for (unsigned Idx = 0; Idx != NumLocs; ++Idx) {
	    CCValAssign &VA = RVLocs[Idx];
	    assert(VA.isRegLoc() && "Can only return in registers!");
	    SDValue Val = OutVals[Idx];

	    const CCValAssign::LocInfo Info = VA.getLocInfo();
	    if (Info == CCValAssign::Full) {
	      if (Outs[Idx].ArgVT == MVT::i1) {
	        // AAPCS requires i1 to be zero-extended to i8 by the producer of the
	        // value. This is strictly redundant on Darwin (which uses "zeroext
	        // i1"), but will be optimised out before ISel.
	        Val = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Val);
	        Val = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Val);
	      }
	    } else if (Info == CCValAssign::BCvt) {
	      Val = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Val);
	    } else {
	      llvm_unreachable("Unknown loc info!");
	    }

	    Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Val, Glue);
	    Glue = Chain.getValue(1);
	    RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
	  }

	  // Append the callee-saved registers preserved via copy, if there are any.
	  // The list is terminated by a zero entry.
	  const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
	  if (const MCPhysReg *CSRs = TRI->getCalleeSavedRegsViaCopy(&MF)) {
	    for (const MCPhysReg *Reg = CSRs; *Reg; ++Reg) {
	      if (AArch64::GPR64RegClass.contains(*Reg))
	        RetOps.push_back(DAG.getRegister(*Reg, MVT::i64));
	      else if (AArch64::FPR64RegClass.contains(*Reg))
	        RetOps.push_back(
	            DAG.getRegister(*Reg, MVT::getFloatingPointVT(64)));
	      else
	        llvm_unreachable("Unexpected register class in CSRsViaCopy!");
	    }
	  }

	  // Update chain.
	  RetOps[0] = Chain;

	  // Add the flag if we have it.
	  if (Glue.getNode())
	    RetOps.push_back(Glue);

	  return DAG.getNode(AArch64ISD::RET_FLAG, DL, MVT::Other, RetOps);
	}

	//===----------------------------------------------------------------------===//
	// Other Lowering Code
	//===----------------------------------------------------------------------===//

	/// Lower a GlobalAddress node. Depending on the subtarget's classification of
	/// the global and on the code model, the result is a GOT load, a four-part
	/// WrapperLarge node, or an ADRP + ADDlow pair.
	SDValue AArch64TargetLowering::LowerGlobalAddress(SDValue Op,
	                                                  SelectionDAG &DAG) const {
	  SDLoc DL(Op);
	  EVT PtrVT = getPointerTy(DAG.getDataLayout());
	  const GlobalAddressSDNode *GN = cast<GlobalAddressSDNode>(Op);
	  const GlobalValue *GV = GN->getGlobal();
	  unsigned char OpFlags =
	      Subtarget->ClassifyGlobalReference(GV, getTargetMachine());

	  assert(GN->getOffset() == 0 && "unexpected offset in global node");

	  // This also catches the large code model case for Darwin.
	  if (OpFlags & AArch64II::MO_GOT) {
	    SDValue GotAddr = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, OpFlags);
	    // FIXME: Once remat is capable of dealing with instructions with register
	    // operands, expand this into two nodes instead of using a wrapper node.
	    return DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, GotAddr);
	  }

	  if (getTargetMachine().getCodeModel() == CodeModel::Large) {
	    // One operand per 16-bit chunk (G3..G0); every chunk except G3 also
	    // carries MO_NC.
	    const unsigned char MO_NC = AArch64II::MO_NC;
	    return DAG.getNode(
	        AArch64ISD::WrapperLarge, DL, PtrVT,
	        DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_G3),
	        DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_G2 | MO_NC),
	        DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_G1 | MO_NC),
	        DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_G0 | MO_NC));
	  }

	  // Use ADRP/ADD or ADRP/LDR for everything else: the small model on ELF and
	  // the only correct model on Darwin.
	  const unsigned char PageFlags = OpFlags | AArch64II::MO_PAGE;
	  const unsigned char PageOffFlags =
	      OpFlags | AArch64II::MO_PAGEOFF | AArch64II::MO_NC;
	  SDValue Hi = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, PageFlags);
	  SDValue Lo = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, PageOffFlags);

	  SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, Hi);
	  return DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, ADRP, Lo);
	}

	/// \brief Lower a Darwin thread-local address reference to the loads and the
	/// call that compute the variable's address, and return the SDValue holding
	/// the final result.
	///
	/// Darwin only has one TLS scheme, which must cope with the fully general
	/// situation in the worst case. This means:
	///   + "extern __thread" declaration.
	///   + Defined in a possibly unknown dynamic library.
	///
	/// Each __thread variable has a [3 x i64] descriptor containing the
	/// information the runtime uses to calculate the address. The only part the
	/// compiler needs to know about is the first xword: a function pointer that
	/// must be called with the address of the entire descriptor in "x0".
	///
	/// Since the descriptor may live in a different unit, in general even the
	/// descriptor must be reached through an indirect load. The "ideal" code
	/// sequence is:
	///     adrp x0, _var@TLVPPAGE
	///     ldr x0, [x0, _var@TLVPPAGEOFF]   ; x0 now contains address of descriptor
	///     ldr x1, [x0]                     ; x1 contains 1st entry of descriptor,
	///                                      ; the function pointer
	///     blr x1                           ; Uses descriptor address in x0
	///                                      ; Address of _var is now in x0.
	///
	/// If the address of _var's descriptor is known to the linker, it can change
	/// the first "ldr" instruction to an appropriate "add x0, x0, #imm" for a
	/// slight efficiency gain.
	SDValue
	AArch64TargetLowering::LowerDarwinGlobalTLSAddress(SDValue Op,
	                                                   SelectionDAG &DAG) const {
	  assert(Subtarget->isTargetDarwin() && "TLS only supported on Darwin");

	  SDLoc DL(Op);
	  MVT PtrVT = getPointerTy(DAG.getDataLayout());
	  MachineFunction &MF = DAG.getMachineFunction();
	  const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();

	  // Address of the variable's descriptor, obtained through a LOADgot node.
	  SDValue TLVPAddr =
	      DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS);
	  SDValue DescAddr = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, TLVPAddr);

	  // The first entry in the descriptor is a function pointer that we must call
	  // to obtain the address of the variable.
	  const auto LoadFlags = MachineMemOperand::MONonTemporal |
	                         MachineMemOperand::MOInvariant |
	                         MachineMemOperand::MODereferenceable;
	  SDValue FuncTLVGet =
	      DAG.getLoad(MVT::i64, DL, DAG.getEntryNode(), DescAddr,
	                  MachinePointerInfo::getGOT(MF),
	                  /* Alignment = */ 8, LoadFlags);
	  SDValue Chain = FuncTLVGet.getValue(1);

	  MF.getFrameInfo().setAdjustsStack(true);

	  // TLS calls preserve all registers except those that absolutely must be
	  // trashed: X0 (it takes an argument), LR (it's a call) and NZCV (let's not be
	  // silly).
	  const uint32_t *Mask =
	      Subtarget->getRegisterInfo()->getTLSCallPreservedMask();

	  // Finally, we can make the call. This is just a degenerate version of a
	  // normal AArch64 call node: x0 takes the address of the descriptor, and
	  // returns the address of the variable in this thread.
	  SDValue ArgCopy =
	      DAG.getCopyToReg(Chain, DL, AArch64::X0, DescAddr, SDValue());
	  SDValue Call =
	      DAG.getNode(AArch64ISD::CALL, DL, DAG.getVTList(MVT::Other, MVT::Glue),
	                  ArgCopy, FuncTLVGet, DAG.getRegister(AArch64::X0, MVT::i64),
	                  DAG.getRegisterMask(Mask), ArgCopy.getValue(1));
	  return DAG.getCopyFromReg(Call, DL, AArch64::X0, PtrVT, Call.getValue(1));
	}

	/// Thread-local accesses under the general-dynamic or local-dynamic model go
	/// through a "TLS-descriptor" call. The variable has a descriptor that is
	/// reachable via a PC-relative ADRP and whose first entry is a function
	/// pointer that performs the resolution.
	///
	/// The sequence is:
	///     adrp x0, :tlsdesc:var
	///     ldr x1, [x0, #:tlsdesc_lo12:var]
	///     add x0, x0, #:tlsdesc_lo12:var
	///     .tlsdesccall var
	///     blr x1
	///     (TPIDR_EL0 offset now in x0)
	///
	/// The sequence has to be emitted unscheduled so that the linker is able to
	/// optimize/relax it. It is therefore represented by a single
	/// pseudo-instruction (TLSDESC_CALLSEQ) which is expanded really late in the
	/// compilation flow, guaranteeing the exact form shown above.
	SDValue AArch64TargetLowering::LowerELFTLSDescCallSeq(SDValue SymAddr,
	                                                      const SDLoc &DL,
	                                                      SelectionDAG &DAG) const {
	  EVT PtrVT = getPointerTy(DAG.getDataLayout());

	  SDValue Entry = DAG.getEntryNode();
	  SDVTList ChainAndGlue = DAG.getVTList(MVT::Other, MVT::Glue);
	  SDValue CallSeq = DAG.getNode(AArch64ISD::TLSDESC_CALLSEQ, DL, ChainAndGlue,
	                                {Entry, SymAddr});

	  // The result is read from X0, glued to the call sequence.
	  return DAG.getCopyFromReg(CallSeq, DL, AArch64::X0, PtrVT,
	                            CallSeq.getValue(1));
	}

	/// Lower a thread-local GlobalAddress on ELF. Emulated TLS is delegated to
	/// LowerToTLSEmulatedModel; otherwise the sequence is chosen by the TLS model
	/// reported by the target machine. Local-exec returns the thread pointer plus
	/// two ADDXri immediates; the remaining models compute an offset that is
	/// added to the thread pointer.
	SDValue
	AArch64TargetLowering::LowerELFGlobalTLSAddress(SDValue Op,
	                                                SelectionDAG &DAG) const {
	  assert(Subtarget->isTargetELF() && "This function expects an ELF target");
	  assert(getTargetMachine().getCodeModel() == CodeModel::Small &&
	         "ELF TLS only supported in small memory model");
	  // Different choices can be made for the maximum size of the TLS area for a
	  // module. For the small address model, the default TLS size is 16MiB and the
	  // maximum TLS size is 4GiB.
	  // FIXME: add -mtls-size command line option and make it control the 16MiB
	  // vs. 4GiB code sequence generation.
	  const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
	  const GlobalValue *GV = GA->getGlobal();

	  TLSModel::Model Model = getTargetMachine().getTLSModel(GV);

	  if (DAG.getTarget().Options.EmulatedTLS)
	    return LowerToTLSEmulatedModel(GA, DAG);

	  // Unless local-dynamic generation is enabled, treat it as general-dynamic.
	  if (Model == TLSModel::LocalDynamic &&
	      !EnableAArch64ELFLocalDynamicTLSGeneration)
	    Model = TLSModel::GeneralDynamic;

	  SDLoc DL(Op);
	  EVT PtrVT = getPointerTy(DAG.getDataLayout());

	  SDValue ThreadBase = DAG.getNode(AArch64ISD::THREAD_POINTER, DL, PtrVT);

	  // Builds "ADDXri Base, Sym, 0" and returns its first result.
	  auto AddSymbolImm = [&](SDValue Base, SDValue Sym) -> SDValue {
	    return SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, Base, Sym,
	                                      DAG.getTargetConstant(0, DL, MVT::i32)),
	                   0);
	  };

	  // Target flags for the two halves of a TLS offset.
	  const unsigned char HiFlags = AArch64II::MO_TLS | AArch64II::MO_HI12;
	  const unsigned char LoFlags =
	      AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC;

	  if (Model == TLSModel::LocalExec) {
	    SDValue HiVar = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, HiFlags);
	    SDValue LoVar = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, LoFlags);

	    SDValue TPWithHi = AddSymbolImm(ThreadBase, HiVar);
	    return AddSymbolImm(TPWithHi, LoVar);
	  }

	  SDValue TPOff;
	  if (Model == TLSModel::InitialExec) {
	    TPOff = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS);
	    TPOff = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, TPOff);
	  } else if (Model == TLSModel::LocalDynamic) {
	    // Local-dynamic accesses proceed in two phases. A general-dynamic TLS
	    // descriptor call against the special symbol _TLS_MODULE_BASE_ to calculate
	    // the beginning of the module's TLS region, followed by a DTPREL offset
	    // calculation.

	    // These accesses will need deduplicating if there's more than one.
	    AArch64FunctionInfo *FuncInfo =
	        DAG.getMachineFunction().getInfo<AArch64FunctionInfo>();
	    FuncInfo->incNumLocalDynamicTLSAccesses();

	    // The call needs a relocation too for linker relaxation. It doesn't make
	    // sense to call it MO_PAGE or MO_PAGEOFF though so we need another copy of
	    // the address.
	    SDValue SymAddr = DAG.getTargetExternalSymbol("_TLS_MODULE_BASE_", PtrVT,
	                                                  AArch64II::MO_TLS);

	    // Now we can calculate the offset from TPIDR_EL0 to this module's
	    // thread-local area.
	    TPOff = LowerELFTLSDescCallSeq(SymAddr, DL, DAG);

	    // Now use :dtprel_whatever: operations to calculate this variable's offset
	    // in its thread-storage area.
	    SDValue HiVar = DAG.getTargetGlobalAddress(GV, DL, MVT::i64, 0, HiFlags);
	    SDValue LoVar = DAG.getTargetGlobalAddress(GV, DL, MVT::i64, 0, LoFlags);

	    TPOff = AddSymbolImm(TPOff, HiVar);
	    TPOff = AddSymbolImm(TPOff, LoVar);
	  } else if (Model == TLSModel::GeneralDynamic) {
	    // The call needs a relocation too for linker relaxation. It doesn't make
	    // sense to call it MO_PAGE or MO_PAGEOFF though so we need another copy of
	    // the address.
	    SDValue SymAddr =
	        DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS);

	    // Finally we can make a call to calculate the offset from tpidr_el0.
	    TPOff = LowerELFTLSDescCallSeq(SymAddr, DL, DAG);
	  } else
	    llvm_unreachable("Unsupported ELF TLS access model");

	  return DAG.getNode(ISD::ADD, DL, PtrVT, ThreadBase, TPOff);
	}

	/// Dispatch a thread-local address lowering to the Darwin or the ELF
	/// implementation; any other platform is treated as unreachable.
	SDValue AArch64TargetLowering::LowerGlobalTLSAddress(SDValue Op,
	                                                     SelectionDAG &DAG) const {
	  if (Subtarget->isTargetDarwin())
	    return LowerDarwinGlobalTLSAddress(Op, DAG);

	  if (!Subtarget->isTargetELF())
	    llvm_unreachable("Unexpected platform trying to use TLS");

	  return LowerELFGlobalTLSAddress(Op, DAG);
	}

	/// Lower a BR_CC node (operands: chain, condition code, LHS, RHS,
	/// destination). f128 comparisons are softened first; overflow-intrinsic
	/// results and integer compares against 0 / -1 are folded into BRCOND,
	/// CBZ/CBNZ or TBZ/TBNZ where possible; everything else becomes a compare
	/// followed by one or two BRCOND nodes.
	SDValue AArch64TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
	  SDLoc dl(Op);
	  SDValue Chain = Op.getOperand(0);
	  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get();
	  SDValue LHS = Op.getOperand(2);
	  SDValue RHS = Op.getOperand(3);
	  SDValue Dest = Op.getOperand(4);

	  // Handle f128 first, since lowering it will result in comparing the return
	  // value of a libcall against zero, which is just what the rest of LowerBR_CC
	  // is expecting to deal with.
	  if (LHS.getValueType() == MVT::f128) {
	    softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl);

	    // If softenSetCCOperands returned a scalar, we need to compare the result
	    // against zero to select between true and false values.
	    if (!RHS.getNode()) {
	      RHS = DAG.getConstant(0, dl, LHS.getValueType());
	      CC = ISD::SETNE;
	    }
	  }

	  // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a branch
	  // instruction.
	  const unsigned Opc = LHS.getOpcode();
	  const bool IsOverflowOp = Opc == ISD::SADDO || Opc == ISD::UADDO ||
	                            Opc == ISD::SSUBO || Opc == ISD::USUBO ||
	                            Opc == ISD::SMULO || Opc == ISD::UMULO;
	  if (IsOverflowOp && LHS.getResNo() == 1 && isOneConstant(RHS)) {
	    assert((CC == ISD::SETEQ || CC == ISD::SETNE) &&
	           "Unexpected condition code.");
	    // Only lower legal XALUO ops.
	    if (!DAG.getTargetLoweringInfo().isTypeLegal(LHS->getValueType(0)))
	      return SDValue();

	    // The actual operation with overflow check.
	    AArch64CC::CondCode OFCC;
	    SDValue Value, Overflow;
	    std::tie(Value, Overflow) = getAArch64XALUOOp(OFCC, LHS.getValue(0), DAG);

	    if (CC == ISD::SETNE)
	      OFCC = getInvertedCondCode(OFCC);
	    SDValue CCVal = DAG.getConstant(OFCC, dl, MVT::i32);

	    return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
	                       Overflow);
	  }

	  if (LHS.getValueType().isInteger()) {
	    assert((LHS.getValueType() == RHS.getValueType()) &&
	           (LHS.getValueType() == MVT::i32 || LHS.getValueType() == MVT::i64));

	    // If the RHS of the comparison is zero, we can potentially fold this
	    // to a specialized branch.
	    const ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS);
	    const bool LHSIsAnd = LHS.getOpcode() == ISD::AND;

	    if (RHSC && RHSC->getZExtValue() == 0) {
	      if (CC == ISD::SETEQ || CC == ISD::SETNE) {
	        // SETEQ selects the branch-if-zero forms (TBZ/CBZ), SETNE the
	        // branch-if-nonzero forms (TBNZ/CBNZ).
	        const bool BranchIfZero = (CC == ISD::SETEQ);

	        // See if we can use a TBZ to fold in an AND as well.
	        // TBZ has a smaller branch displacement than CBZ. If the offset is
	        // out of bounds, a late MI-layer pass rewrites branches.
	        // 403.gcc is an example that hits this case.
	        if (LHSIsAnd && isa<ConstantSDNode>(LHS.getOperand(1)) &&
	            isPowerOf2_64(LHS.getConstantOperandVal(1))) {
	          SDValue Test = LHS.getOperand(0);
	          uint64_t Mask = LHS.getConstantOperandVal(1);
	          return DAG.getNode(
	              BranchIfZero ? AArch64ISD::TBZ : AArch64ISD::TBNZ, dl,
	              MVT::Other, Chain, Test,
	              DAG.getConstant(Log2_64(Mask), dl, MVT::i64), Dest);
	        }

	        return DAG.getNode(BranchIfZero ? AArch64ISD::CBZ : AArch64ISD::CBNZ,
	                           dl, MVT::Other, Chain, LHS, Dest);
	      }

	      if (CC == ISD::SETLT && !LHSIsAnd) {
	        // Don't combine AND since emitComparison converts the AND to an ANDS
	        // (a.k.a. TST) and the test in the test bit and branch instruction
	        // becomes redundant. This would also increase register pressure.
	        uint64_t SignBit = LHS.getValueSizeInBits() - 1;
	        return DAG.getNode(AArch64ISD::TBNZ, dl, MVT::Other, Chain, LHS,
	                           DAG.getConstant(SignBit, dl, MVT::i64), Dest);
	      }
	    }

	    if (RHSC && RHSC->getSExtValue() == -1 && CC == ISD::SETGT && !LHSIsAnd) {
	      // Don't combine AND since emitComparison converts the AND to an ANDS
	      // (a.k.a. TST) and the test in the test bit and branch instruction
	      // becomes redundant. This would also increase register pressure.
	      uint64_t SignBit = LHS.getValueSizeInBits() - 1;
	      return DAG.getNode(AArch64ISD::TBZ, dl, MVT::Other, Chain, LHS,
	                         DAG.getConstant(SignBit, dl, MVT::i64), Dest);
	    }

	    // General integer case: explicit compare plus conditional branch.
	    SDValue CCVal;
	    SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl);
	    return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
	                       Cmp);
	  }

	  assert(LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64);

	  // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally
	  // clean. Some of them require two branches to implement.
	  SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
	  AArch64CC::CondCode CC1, CC2;
	  changeFPCCToAArch64CC(CC, CC1, CC2);
	  SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32);
	  SDValue BR1 =
	      DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CC1Val, Cmp);

	  // A second condition of AL means one branch is enough.
	  if (CC2 == AArch64CC::AL)
	    return BR1;

	  SDValue CC2Val = DAG.getConstant(CC2, dl, MVT::i32);
	  return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, BR1, Dest, CC2Val,
	                     Cmp);
	}

	/// Lower FCOPYSIGN for f32/v2f32/v4f32 and f64/v2f64. Both operands are
	/// viewed as integer vectors, combined by an AArch64ISD::BIT node with a
	/// sign-bit mask, and the result is converted back to the original type.
	SDValue AArch64TargetLowering::LowerFCOPYSIGN(SDValue Op,
	                                              SelectionDAG &DAG) const {
	  SDLoc DL(Op);
	  EVT VT = Op.getValueType();

	  SDValue In1 = Op.getOperand(0);
	  SDValue In2 = Op.getOperand(1);

	  // Bring the sign operand to the width of the result type.
	  EVT SrcVT = In2.getValueType();
	  if (SrcVT.bitsLT(VT))
	    In2 = DAG.getNode(ISD::FP_EXTEND, DL, VT, In2);
	  else if (SrcVT.bitsGT(VT))
	    In2 = DAG.getNode(ISD::FP_ROUND, DL, VT, In2, DAG.getIntPtrConstant(0, DL));

	  const bool Is32BitElt =
	      VT == MVT::f32 || VT == MVT::v2f32 || VT == MVT::v4f32;
	  const bool Is64BitElt = VT == MVT::f64 || VT == MVT::v2f64;
	  if (!Is32BitElt && !Is64BitElt)
	    llvm_unreachable("Invalid type for copysign!");

	  // Integer vector type to operate in, the constant splatted for the mask,
	  // and the subregister index used for scalar inputs.
	  EVT VecVT;
	  uint64_t EltMask;
	  int SubRegIdx;
	  if (Is32BitElt) {
	    VecVT = (VT == MVT::v2f32 ? MVT::v2i32 : MVT::v4i32);
	    EltMask = 0x80000000ULL;
	    SubRegIdx = AArch64::ssub;
	  } else {
	    VecVT = MVT::v2i64;
	    // We want to materialize a mask with the high bit set, but the AdvSIMD
	    // immediate moves cannot materialize that in a single instruction for
	    // 64-bit elements. Instead, materialize zero and then negate it.
	    EltMask = 0;
	    SubRegIdx = AArch64::dsub;
	  }

	  // Scalars are inserted into an undef vector; vectors are simply bitcast.
	  auto ToIntVector = [&](SDValue V) -> SDValue {
	    if (VT.isVector())
	      return DAG.getNode(ISD::BITCAST, DL, VecVT, V);
	    return DAG.getTargetInsertSubreg(SubRegIdx, DL, VecVT,
	                                     DAG.getUNDEF(VecVT), V);
	  };
	  SDValue VecVal1 = ToIntVector(In1);
	  SDValue VecVal2 = ToIntVector(In2);

	  SDValue BuildVec = DAG.getConstant(EltMask, DL, VecVT);

	  // If we couldn't materialize the mask above, then the mask vector will be
	  // the zero vector, and we need to negate it here.
	  if (Is64BitElt) {
	    BuildVec = DAG.getNode(ISD::BITCAST, DL, MVT::v2f64, BuildVec);
	    BuildVec = DAG.getNode(ISD::FNEG, DL, MVT::v2f64, BuildVec);
	    BuildVec = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, BuildVec);
	  }

	  SDValue Sel =
	      DAG.getNode(AArch64ISD::BIT, DL, VecVT, VecVal1, VecVal2, BuildVec);

	  // Undo the widening: scalars come back out of the subregister, vectors
	  // are bitcast to the FP vector type.
	  if (VT == MVT::f32)
	    return DAG.getTargetExtractSubreg(AArch64::ssub, DL, VT, Sel);
	  if (VT == MVT::f64)
	    return DAG.getTargetExtractSubreg(AArch64::dsub, DL, VT, Sel);
	  return DAG.getNode(ISD::BITCAST, DL, VT, Sel);
	}

	/// Lower a scalar CTPOP through AdvSIMD: the value is viewed as v8i8, the
	/// bytes are pop-counted individually and then summed with the uaddlv
	/// intrinsic. Returns an empty SDValue when the function carries the
	/// NoImplicitFloat attribute or the subtarget has no NEON.
	SDValue AArch64TargetLowering::LowerCTPOP(SDValue Op, SelectionDAG &DAG) const {
	  const auto *Fn = DAG.getMachineFunction().getFunction();
	  if (Fn->hasFnAttribute(Attribute::NoImplicitFloat) || !Subtarget->hasNEON())
	    return SDValue();

	  // While there is no integer popcount instruction, it can
	  // be more efficiently lowered to the following sequence that uses
	  // AdvSIMD registers/instructions as long as the copies to/from
	  // the AdvSIMD registers are cheap.
	  //  FMOV    D0, X0        // copy 64-bit int to vector, high bits zero'd
	  //  CNT     V0.8B, V0.8B  // 8xbyte pop-counts
	  //  ADDV    B0, V0.8B     // sum 8xbyte pop-counts
	  //  UMOV    X0, V0.B[0]   // copy byte result back to integer reg
	  SDLoc DL(Op);
	  EVT VT = Op.getValueType();
	  SDValue Src = Op.getOperand(0);

	  // A 32-bit input is widened to 64 bits first.
	  if (VT == MVT::i32)
	    Src = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Src);
	  SDValue Bytes = DAG.getNode(ISD::BITCAST, DL, MVT::v8i8, Src);

	  SDValue ByteCounts = DAG.getNode(ISD::CTPOP, DL, MVT::v8i8, Bytes);
	  SDValue IntrinsicID =
	      DAG.getConstant(Intrinsic::aarch64_neon_uaddlv, DL, MVT::i32);
	  SDValue Total = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::i32,
	                              IntrinsicID, ByteCounts);

	  // The sum is produced as i32; widen it again for a 64-bit result.
	  if (VT == MVT::i64)
	    return DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Total);
	  return Total;
	}

	/// Lower a SETCC node. Vector results are forwarded to LowerVSETCC. Scalar
	/// results are produced as CSEL nodes selecting between the constants 1 and
	/// 0, after softening f128 operands if necessary.
	SDValue AArch64TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
	  EVT VT = Op.getValueType();
	  if (VT.isVector())
	    return LowerVSETCC(Op, DAG);

	  SDLoc dl(Op);
	  SDValue LHS = Op.getOperand(0);
	  SDValue RHS = Op.getOperand(1);
	  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();

	  // We chose ZeroOrOneBooleanContents, so use zero and one.
	  SDValue TVal = DAG.getConstant(1, dl, VT);
	  SDValue FVal = DAG.getConstant(0, dl, VT);

	  // Handle f128 first, since one possible outcome is a normal integer
	  // comparison which gets picked up by the next if statement.
	  if (LHS.getValueType() == MVT::f128) {
	    softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl);

	    // If softenSetCCOperands returned a scalar, use it.
	    if (!RHS.getNode()) {
	      assert(LHS.getValueType() == VT && "Unexpected setcc expansion!");
	      return LHS;
	    }
	  }

	  if (LHS.getValueType().isInteger()) {
	    // Compare with the inverted condition and swap the true and false
	    // operands of the CSEL. This will allow the setcc to be matched to a
	    // single CSINC instruction.
	    const ISD::CondCode InvCC = ISD::getSetCCInverse(CC, true);
	    SDValue CCVal;
	    SDValue Cmp = getAArch64Cmp(LHS, RHS, InvCC, CCVal, DAG, dl);
	    return DAG.getNode(AArch64ISD::CSEL, dl, VT, FVal, TVal, CCVal, Cmp);
	  }

	  // Now we know we're dealing with FP values.
	  assert(LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64);

	  // If that fails, we'll need to perform an FCMP + CSEL sequence. Go ahead
	  // and do the comparison.
	  SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG);

	  AArch64CC::CondCode CC1, CC2;
	  changeFPCCToAArch64CC(CC, CC1, CC2);

	  if (CC2 != AArch64CC::AL) {
	    // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't
	    // totally clean. Some of them require two CSELs to implement. As is in
	    // this case, we emit the first CSEL and then emit a second using the output
	    // of the first as the RHS. We're effectively OR'ing the two CC's together.

	    // FIXME: It would be nice if we could match the two CSELs to two CSINCs.
	    SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32);
	    SDValue CS1 =
	        DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, FVal, CC1Val, Cmp);

	    SDValue CC2Val = DAG.getConstant(CC2, dl, MVT::i32);
	    return DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, CS1, CC2Val, Cmp);
	  }

	  // A single condition suffices. Use its inverse and swap the true and false
	  // operands so that the setcc can be matched to a single CSINC instruction.
	  changeFPCCToAArch64CC(ISD::getSetCCInverse(CC, false), CC1, CC2);
	  SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32);
	  return DAG.getNode(AArch64ISD::CSEL, dl, VT, FVal, TVal, CC1Val, Cmp);
	}

	SDValue AArch64TargetLowering::LowerSELECT_CC(ISD::CondCode CC, SDValue LHS,
	SDValue RHS, SDValue TVal,
	SDValue FVal, const SDLoc &dl,
	SelectionDAG &DAG) const {
	// Handle f128 first, because it will result in a comparison of some RTLIB
	// call result against zero.
	if (LHS.getValueType() == MVT::f128) {
	softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl);

	// If softenSetCCOperands returned a scalar, we need to compare the result
	// against zero to select between true and false values.
	if (!RHS.getNode()) {
	RHS = DAG.getConstant(0, dl, LHS.getValueType());
	CC = ISD::SETNE;
	}
	}

	// Also handle f16, for which we need to do a f32 comparison.
	if (LHS.getValueType() == MVT::f16) {
	LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, LHS);
	RHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, RHS);
	}

	// Next, handle integers.
	if (LHS.getValueType().isInteger()) {
	assert((LHS.getValueType() == RHS.getValueType()) &&
	(LHS.getValueType() == MVT::i32 \|\| LHS.getValueType() == MVT::i64));

	unsigned Opcode = AArch64ISD::CSEL;

	// If both the TVal and the FVal are constants, see if we can swap them in
	// order to form a CSINV or CSINC out of them.
	ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(FVal);
	ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(TVal);

	if (CTVal && CFVal && CTVal->isAllOnesValue() && CFVal->isNullValue()) {
	std::swap(TVal, FVal);
	std::swap(CTVal, CFVal);
	CC = ISD::getSetCCInverse(CC, true);
	} else if (CTVal && CFVal && CTVal->isOne() && CFVal->isNullValue()) {
	std::swap(TVal, FVal);
	std::swap(CTVal, CFVal);
	CC = ISD::getSetCCInverse(CC, true);
	} else if (TVal.getOpcode() == ISD::XOR) {
	// If TVal is a NOT we want to swap TVal and FVal so that we can match
	// with a CSINV rather than a CSEL.
	if (isAllOnesConstant(TVal.getOperand(1))) {
	std::swap(TVal, FVal);
	std::swap(CTVal, CFVal);
	CC = ISD::getSetCCInverse(CC, true);
	}
	} else if (TVal.getOpcode() == ISD::SUB) {
	// If TVal is a negation (SUB from 0) we want to swap TVal and FVal so
	// that we can match with a CSNEG rather than a CSEL.
	if (isNullConstant(TVal.getOperand(0))) {
	std::swap(TVal, FVal);
	std::swap(CTVal, CFVal);
	CC = ISD::getSetCCInverse(CC, true);
	}
	} else if (CTVal && CFVal) {
	const int64_t TrueVal = CTVal->getSExtValue();
	const int64_t FalseVal = CFVal->getSExtValue();
	bool Swap = false;

	// If both TVal and FVal are constants, see if FVal is the
	// inverse/negation/increment of TVal and generate a CSINV/CSNEG/CSINC
	// instead of a CSEL in that case.
	if (TrueVal == ~FalseVal) {
	Opcode = AArch64ISD::CSINV;
	} else if (TrueVal == -FalseVal) {
	Opcode = AArch64ISD::CSNEG;
	} else if (TVal.getValueType() == MVT::i32) {
	// If our operands are only 32-bit wide, make sure we use 32-bit
	// arithmetic for the check whether we can use CSINC. This ensures that
	// the addition in the check will wrap around properly in case there is
	// an overflow (which would not be the case if we do the check with
	// 64-bit arithmetic).
	const uint32_t TrueVal32 = CTVal->getZExtValue();
	const uint32_t FalseVal32 = CFVal->getZExtValue();

	if ((TrueVal32 == FalseVal32 + 1) \|\| (TrueVal32 + 1 == FalseVal32)) {
	Opcode = AArch64ISD::CSINC;

	if (TrueVal32 > FalseVal32) {
	Swap = true;
	}
	}
	// 64-bit check whether we can use CSINC.
	} else if ((TrueVal == FalseVal + 1) \|\| (TrueVal + 1 == FalseVal)) {
	Opcode = AArch64ISD::CSINC;

	if (TrueVal > FalseVal) {
	Swap = true;
	}
	}

	// Swap TVal and FVal if necessary.
	if (Swap) {
	std::swap(TVal, FVal);
	std::swap(CTVal, CFVal);
	CC = ISD::getSetCCInverse(CC, true);
	}

	if (Opcode != AArch64ISD::CSEL) {
	// Drop FVal since we can get its value by simply inverting/negating
	// TVal.
	FVal = TVal;
	}
	}

	// Avoid materializing a constant when possible by reusing a known value in
	// a register. However, don't perform this optimization if the known value
	// is one, zero or negative one in the case of a CSEL. We can always
	// materialize these values using CSINC, CSEL and CSINV with wzr/xzr as the
	// FVal, respectively.
	ConstantSDNode *RHSVal = dyn_cast<ConstantSDNode>(RHS);
	if (Opcode == AArch64ISD::CSEL && RHSVal && !RHSVal->isOne() &&
	!RHSVal->isNullValue() && !RHSVal->isAllOnesValue()) {
	AArch64CC::CondCode AArch64CC = changeIntCCToAArch64CC(CC);
	// Transform "a == C ? C : x" to "a == C ? a : x" and "a != C ? x : C" to
	// "a != C ? x : a" to avoid materializing C.
	if (CTVal && CTVal == RHSVal && AArch64CC == AArch64CC::EQ)
	TVal = LHS;
	else if (CFVal && CFVal == RHSVal && AArch64CC == AArch64CC::NE)
	FVal = LHS;
	} else if (Opcode == AArch64ISD::CSNEG && RHSVal && RHSVal->isOne()) {
	assert (CTVal && CFVal && "Expected constant operands for CSNEG.");
	// Use a CSINV to transform "a == C ? 1 : -1" to "a == C ? a : -1" to
	// avoid materializing C.
	AArch64CC::CondCode AArch64CC = changeIntCCToAArch64CC(CC);
	if (CTVal == RHSVal && AArch64CC == AArch64CC::EQ) {
	Opcode = AArch64ISD::CSINV;
	TVal = LHS;
	FVal = DAG.getConstant(0, dl, FVal.getValueType());
	}
	}

	SDValue CCVal;
	SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl);

	EVT VT = TVal.getValueType();
	return DAG.getNode(Opcode, dl, VT, TVal, FVal, CCVal, Cmp);
	}

	// Now we know we're dealing with FP values.
	assert(LHS.getValueType() == MVT::f32 \|\| LHS.getValueType() == MVT::f64);
	assert(LHS.getValueType() == RHS.getValueType());
	EVT VT = TVal.getValueType();
	SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG);

	// Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally
	// clean. Some of them require two CSELs to implement.
	AArch64CC::CondCode CC1, CC2;
	changeFPCCToAArch64CC(CC, CC1, CC2);

	if (DAG.getTarget().Options.UnsafeFPMath) {
	// Transform "a == 0.0 ? 0.0 : x" to "a == 0.0 ? a : x" and
	// "a != 0.0 ? x : 0.0" to "a != 0.0 ? x : a" to avoid materializing 0.0.
	ConstantFPSDNode *RHSVal = dyn_cast<ConstantFPSDNode>(RHS);
	if (RHSVal && RHSVal->isZero()) {
	ConstantFPSDNode *CFVal = dyn_cast<ConstantFPSDNode>(FVal);
	ConstantFPSDNode *CTVal = dyn_cast<ConstantFPSDNode>(TVal);

	if ((CC == ISD::SETEQ \|\| CC == ISD::SETOEQ \|\| CC == ISD::SETUEQ) &&
	CTVal && CTVal->isZero() && TVal.getValueType() == LHS.getValueType())
	TVal = LHS;
	else if ((CC == ISD::SETNE \|\| CC == ISD::SETONE \|\| CC == ISD::SETUNE) &&
	CFVal && CFVal->isZero() &&
	FVal.getValueType() == LHS.getValueType())
	FVal = LHS;
	}
	}

	// Emit first, and possibly only, CSEL.
	SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32);
	SDValue CS1 = DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, FVal, CC1Val, Cmp);

	// If we need a second CSEL, emit it, using the output of the first as the
	// RHS. We're effectively OR'ing the two CC's together.
	if (CC2 != AArch64CC::AL) {
	SDValue CC2Val = DAG.getConstant(CC2, dl, MVT::i32);
	return DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, CS1, CC2Val, Cmp);
	}

	// Otherwise, return the output of the first CSEL.
	return CS1;
	}

	/// Lower a SELECT_CC node by unpacking its five operands and forwarding them
	/// to the operand-wise LowerSELECT_CC overload.
	SDValue AArch64TargetLowering::LowerSELECT_CC(SDValue Op,
	                                              SelectionDAG &DAG) const {
	  // Operand layout read here: 0 = LHS, 1 = RHS, 2 = value if true,
	  // 3 = value if false, 4 = condition code.
	  SDLoc Loc(Op);
	  SDValue CmpLHS = Op.getOperand(0);
	  SDValue CmpRHS = Op.getOperand(1);
	  SDValue TrueV = Op.getOperand(2);
	  SDValue FalseV = Op.getOperand(3);
	  ISD::CondCode Cond = cast<CondCodeSDNode>(Op.getOperand(4))->get();
	  return LowerSELECT_CC(Cond, CmpLHS, CmpRHS, TrueV, FalseV, Loc, DAG);
	}

	/// Lower a SELECT node (operand 0 = condition, 1 = true value, 2 = false
	/// value).
	///
	/// When the condition is the overflow result (result number 1) of an
	/// add/sub/mul-with-overflow node, a CSEL on the overflow flag is emitted
	/// directly. Otherwise the condition is turned into a comparison and handed
	/// to LowerSELECT_CC.
	SDValue AArch64TargetLowering::LowerSELECT(SDValue Op,
	                                           SelectionDAG &DAG) const {
	  SDValue Cond = Op->getOperand(0);
	  SDValue TrueV = Op->getOperand(1);
	  SDValue FalseV = Op->getOperand(2);
	  SDLoc Loc(Op);

	  // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a select
	  // instruction.
	  bool IsOverflowOp = false;
	  switch (Cond.getOpcode()) {
	  case ISD::SADDO:
	  case ISD::UADDO:
	  case ISD::SSUBO:
	  case ISD::USUBO:
	  case ISD::SMULO:
	  case ISD::UMULO:
	    IsOverflowOp = true;
	    break;
	  default:
	    break;
	  }

	  if (Cond.getResNo() == 1 && IsOverflowOp) {
	    // Only lower legal XALUO ops.
	    if (!DAG.getTargetLoweringInfo().isTypeLegal(Cond->getValueType(0)))
	      return SDValue();

	    AArch64CC::CondCode OverflowCC;
	    SDValue Arith, OverflowFlag;
	    std::tie(Arith, OverflowFlag) =
	        getAArch64XALUOOp(OverflowCC, Cond.getValue(0), DAG);
	    SDValue OverflowCCVal = DAG.getConstant(OverflowCC, Loc, MVT::i32);

	    return DAG.getNode(AArch64ISD::CSEL, Loc, Op.getValueType(), TrueV, FalseV,
	                       OverflowCCVal, OverflowFlag);
	  }

	  // Lower it the same way as we would lower a SELECT_CC node.
	  if (Cond.getOpcode() != ISD::SETCC) {
	    // Any other condition value is tested as "Cond != 0".
	    SDValue Zero = DAG.getConstant(0, Loc, Cond.getValueType());
	    return LowerSELECT_CC(ISD::SETNE, Cond, Zero, TrueV, FalseV, Loc, DAG);
	  }

	  // A SETCC condition is unpacked into its own operands and condition code.
	  SDValue CmpLHS = Cond.getOperand(0);
	  SDValue CmpRHS = Cond.getOperand(1);
	  ISD::CondCode CmpCC = cast<CondCodeSDNode>(Cond->getOperand(2))->get();
	  return LowerSELECT_CC(CmpCC, CmpLHS, CmpRHS, TrueV, FalseV, Loc, DAG);
	}

	/// Lower a JumpTable node to the address of the jump table.
	///
	/// Under the large code model on non-MachO targets the address is a
	/// WrapperLarge of four target jump-table operands (G3..G0); otherwise it is
	/// an ADRP of the page operand followed by an ADDlow of the page offset.
	SDValue AArch64TargetLowering::LowerJumpTable(SDValue Op,
	                                              SelectionDAG &DAG) const {
	  // Jump table entries as PC relative offsets. No additional tweaking
	  // is necessary here. Just get the address of the jump table.
	  JumpTableSDNode *JTNode = cast<JumpTableSDNode>(Op);
	  EVT PtrVT = getPointerTy(DAG.getDataLayout());
	  SDLoc Loc(Op);

	  // Build a target jump-table operand carrying the given operand flags.
	  auto TargetJT = [&](unsigned char Flags) {
	    return DAG.getTargetJumpTable(JTNode->getIndex(), PtrVT, Flags);
	  };

	  if (getTargetMachine().getCodeModel() == CodeModel::Large &&
	      !Subtarget->isTargetMachO()) {
	    const unsigned char NoCheck = AArch64II::MO_NC;
	    SDValue G3 = TargetJT(AArch64II::MO_G3);
	    SDValue G2 = TargetJT(AArch64II::MO_G2 | NoCheck);
	    SDValue G1 = TargetJT(AArch64II::MO_G1 | NoCheck);
	    SDValue G0 = TargetJT(AArch64II::MO_G0 | NoCheck);
	    return DAG.getNode(AArch64ISD::WrapperLarge, Loc, PtrVT, G3, G2, G1, G0);
	  }

	  SDValue Page = TargetJT(AArch64II::MO_PAGE);
	  SDValue PageOff = TargetJT(AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
	  SDValue PageAddr = DAG.getNode(AArch64ISD::ADRP, Loc, PtrVT, Page);
	  return DAG.getNode(AArch64ISD::ADDlow, Loc, PtrVT, PageAddr, PageOff);
	}

	/// Lower a ConstantPool node to the address of the constant-pool entry.
	///
	/// Large code model: a GOT load on MachO, a WrapperLarge of four operands
	/// (G3..G0) elsewhere. Any other code model: ADRP + ADDlow.
	SDValue AArch64TargetLowering::LowerConstantPool(SDValue Op,
	                                                 SelectionDAG &DAG) const {
	  ConstantPoolSDNode *CPNode = cast<ConstantPoolSDNode>(Op);
	  EVT PtrVT = getPointerTy(DAG.getDataLayout());
	  SDLoc Loc(Op);

	  // Build a target constant-pool operand for this entry with the given
	  // operand flags.
	  auto TargetCP = [&](unsigned char Flags) {
	    return DAG.getTargetConstantPool(CPNode->getConstVal(), PtrVT,
	                                     CPNode->getAlignment(),
	                                     CPNode->getOffset(), Flags);
	  };

	  if (getTargetMachine().getCodeModel() != CodeModel::Large) {
	    // Use ADRP/ADD or ADRP/LDR for everything else: the small memory model on
	    // ELF, the only valid one on Darwin.
	    SDValue Page = TargetCP(AArch64II::MO_PAGE);
	    SDValue PageOff = TargetCP(AArch64II::MO_PAGEOFF | AArch64II::MO_NC);

	    SDValue PageAddr = DAG.getNode(AArch64ISD::ADRP, Loc, PtrVT, Page);
	    return DAG.getNode(AArch64ISD::ADDlow, Loc, PtrVT, PageAddr, PageOff);
	  }

	  // Use the GOT for the large code model on iOS.
	  if (Subtarget->isTargetMachO()) {
	    SDValue GotAddr = TargetCP(AArch64II::MO_GOT);
	    return DAG.getNode(AArch64ISD::LOADgot, Loc, PtrVT, GotAddr);
	  }

	  const unsigned char NoCheck = AArch64II::MO_NC;
	  SDValue G3 = TargetCP(AArch64II::MO_G3);
	  SDValue G2 = TargetCP(AArch64II::MO_G2 | NoCheck);
	  SDValue G1 = TargetCP(AArch64II::MO_G1 | NoCheck);
	  SDValue G0 = TargetCP(AArch64II::MO_G0 | NoCheck);
	  return DAG.getNode(AArch64ISD::WrapperLarge, Loc, PtrVT, G3, G2, G1, G0);
	}

	/// Lower a BlockAddress node to the address of the referenced basic block.
	///
	/// Under the large code model on non-MachO targets the address is a
	/// WrapperLarge of four operands (G3..G0); otherwise it is ADRP + ADDlow.
	/// Every target block-address operand is created with offset 0.
	SDValue AArch64TargetLowering::LowerBlockAddress(SDValue Op,
	                                                 SelectionDAG &DAG) const {
	  const BlockAddress *Block = cast<BlockAddressSDNode>(Op)->getBlockAddress();
	  EVT PtrVT = getPointerTy(DAG.getDataLayout());
	  SDLoc Loc(Op);

	  // Build a target block-address operand with the given operand flags.
	  auto TargetBA = [&](unsigned char Flags) {
	    return DAG.getTargetBlockAddress(Block, PtrVT, 0, Flags);
	  };

	  const bool UseWrapperLarge =
	      getTargetMachine().getCodeModel() == CodeModel::Large &&
	      !Subtarget->isTargetMachO();
	  if (!UseWrapperLarge) {
	    SDValue Page = TargetBA(AArch64II::MO_PAGE);
	    SDValue PageOff = TargetBA(AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
	    SDValue PageAddr = DAG.getNode(AArch64ISD::ADRP, Loc, PtrVT, Page);
	    return DAG.getNode(AArch64ISD::ADDlow, Loc, PtrVT, PageAddr, PageOff);
	  }

	  const unsigned char NoCheck = AArch64II::MO_NC;
	  SDValue G3 = TargetBA(AArch64II::MO_G3);
	  SDValue G2 = TargetBA(AArch64II::MO_G2 | NoCheck);
	  SDValue G1 = TargetBA(AArch64II::MO_G1 | NoCheck);
	  SDValue G0 = TargetBA(AArch64II::MO_G0 | NoCheck);
	  return DAG.getNode(AArch64ISD::WrapperLarge, Loc, PtrVT, G3, G2, G1, G0);
	}

	/// Lower VASTART for Darwin: store the frame index of the variadic-argument
	/// stack area to the va_list location given by operand 1.
	SDValue AArch64TargetLowering::LowerDarwin_VASTART(SDValue Op,
	                                                   SelectionDAG &DAG) const {
	  MachineFunction &MF = DAG.getMachineFunction();
	  AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();

	  SDLoc Loc(Op);
	  auto PtrVT = getPointerTy(DAG.getDataLayout());
	  SDValue StackSlot = DAG.getFrameIndex(AFI->getVarArgsStackIndex(), PtrVT);

	  // Operand 0 is the chain, operand 1 the va_list address and operand 2 the
	  // source value describing that memory.
	  SDValue Chain = Op.getOperand(0);
	  SDValue VAListAddr = Op.getOperand(1);
	  const Value *SrcVal = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
	  return DAG.getStore(Chain, Loc, StackSlot, VAListAddr,
	                      MachinePointerInfo(SrcVal));
	}

	/// Lower VASTART for the AAPCS va_list struct by storing each of its fields:
	/// __stack (offset 0), __gr_top (8), __vr_top (16), __gr_offs (24) and
	/// __vr_offs (28). The __gr_top/__vr_top stores are only emitted when the
	/// corresponding register save area is non-empty. All stores hang off the
	/// incoming chain and are joined by a TokenFactor.
	SDValue AArch64TargetLowering::LowerAAPCS_VASTART(SDValue Op,
	                                                  SelectionDAG &DAG) const {
	  // The layout of the va_list struct is specified in the AArch64 Procedure Call
	  // Standard, section B.3.
	  MachineFunction &MF = DAG.getMachineFunction();
	  AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
	  auto PtrVT = getPointerTy(DAG.getDataLayout());
	  SDLoc Loc(Op);

	  SDValue InChain = Op.getOperand(0);
	  SDValue VAListBase = Op.getOperand(1);
	  const Value *SrcVal = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
	  SmallVector<SDValue, 4> Stores;

	  // Address of the va_list field located Offset bytes past the base.
	  auto FieldAddr = [&](uint64_t Offset) {
	    return DAG.getNode(ISD::ADD, Loc, PtrVT, VAListBase,
	                       DAG.getConstant(Offset, Loc, PtrVT));
	  };

	  // void *__stack at offset 0
	  SDValue StackTop = DAG.getFrameIndex(AFI->getVarArgsStackIndex(), PtrVT);
	  Stores.push_back(DAG.getStore(InChain, Loc, StackTop, VAListBase,
	                                MachinePointerInfo(SrcVal),
	                                /* Alignment = */ 8));

	  // void *__gr_top at offset 8
	  const int GPRBytes = AFI->getVarArgsGPRSize();
	  if (GPRBytes > 0) {
	    SDValue GRTopField = FieldAddr(8);

	    // The top of the save area is its frame index plus its size.
	    SDValue GRSaveArea = DAG.getFrameIndex(AFI->getVarArgsGPRIndex(), PtrVT);
	    SDValue GRTopValue = DAG.getNode(ISD::ADD, Loc, PtrVT, GRSaveArea,
	                                     DAG.getConstant(GPRBytes, Loc, PtrVT));

	    Stores.push_back(DAG.getStore(InChain, Loc, GRTopValue, GRTopField,
	                                  MachinePointerInfo(SrcVal, 8),
	                                  /* Alignment = */ 8));
	  }

	  // void *__vr_top at offset 16
	  const int FPRBytes = AFI->getVarArgsFPRSize();
	  if (FPRBytes > 0) {
	    SDValue VRTopField = FieldAddr(16);

	    SDValue VRSaveArea = DAG.getFrameIndex(AFI->getVarArgsFPRIndex(), PtrVT);
	    SDValue VRTopValue = DAG.getNode(ISD::ADD, Loc, PtrVT, VRSaveArea,
	                                     DAG.getConstant(FPRBytes, Loc, PtrVT));

	    Stores.push_back(DAG.getStore(InChain, Loc, VRTopValue, VRTopField,
	                                  MachinePointerInfo(SrcVal, 16),
	                                  /* Alignment = */ 8));
	  }

	  // int __gr_offs at offset 24
	  SDValue GROffsField = FieldAddr(24);
	  SDValue GROffsValue = DAG.getConstant(-GPRBytes, Loc, MVT::i32);
	  Stores.push_back(DAG.getStore(InChain, Loc, GROffsValue, GROffsField,
	                                MachinePointerInfo(SrcVal, 24),
	                                /* Alignment = */ 4));

	  // int __vr_offs at offset 28
	  SDValue VROffsField = FieldAddr(28);
	  SDValue VROffsValue = DAG.getConstant(-FPRBytes, Loc, MVT::i32);
	  Stores.push_back(DAG.getStore(InChain, Loc, VROffsValue, VROffsField,
	                                MachinePointerInfo(SrcVal, 28),
	                                /* Alignment = */ 4));

	  return DAG.getNode(ISD::TokenFactor, Loc, MVT::Other, Stores);
	}

	/// Lower VASTART by dispatching on the target: Darwin uses the
	/// single-pointer form, everything else the AAPCS va_list struct.
	SDValue AArch64TargetLowering::LowerVASTART(SDValue Op,
	                                            SelectionDAG &DAG) const {
	  if (Subtarget->isTargetDarwin())
	    return LowerDarwin_VASTART(Op, DAG);
	  return LowerAAPCS_VASTART(Op, DAG);
	}

	/// Lower VACOPY to a memcpy of the whole va_list object from operand 2 to
	/// operand 1.
	SDValue AArch64TargetLowering::LowerVACOPY(SDValue Op,
	                                           SelectionDAG &DAG) const {
	  SDLoc Loc(Op);

	  // AAPCS has three pointers and two ints (= 32 bytes), Darwin has single
	  // pointer.
	  unsigned VaListBytes = 32;
	  if (Subtarget->isTargetDarwin())
	    VaListBytes = 8;

	  SDValue Chain = Op.getOperand(0);
	  SDValue Dest = Op.getOperand(1);
	  SDValue Src = Op.getOperand(2);
	  const Value *DestSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
	  const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();

	  SDValue ByteCount = DAG.getConstant(VaListBytes, Loc, MVT::i32);
	  return DAG.getMemcpy(Chain, Loc, Dest, Src, ByteCount, 8, false, false,
	                       false, MachinePointerInfo(DestSV),
	                       MachinePointerInfo(SrcSV));
	}

	/// Lower a VAARG node for Darwin, where the va_list is a single pointer.
	///
	/// Operands read here: 0 = chain, 1 = address of the va_list, 2 = source
	/// value for that memory, 3 = required alignment of the argument.
	///
	/// The sequence is: load the current argument pointer, realign it if the
	/// argument needs more than 8-byte alignment, store the pointer advanced past
	/// the argument back to the va_list, then load the argument itself. The
	/// result carries both the loaded value and the output chain.
	SDValue AArch64TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
	  assert(Subtarget->isTargetDarwin() &&
	         "automatic va_arg instruction only works on Darwin");

	  const Value *V = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
	  EVT VT = Op.getValueType();
	  SDLoc DL(Op);
	  SDValue Chain = Op.getOperand(0);
	  SDValue Addr = Op.getOperand(1);
	  unsigned Align = Op.getConstantOperandVal(3);
	  auto PtrVT = getPointerTy(DAG.getDataLayout());

	  SDValue VAList = DAG.getLoad(PtrVT, DL, Chain, Addr, MachinePointerInfo(V));
	  Chain = VAList.getValue(1);

	  if (Align > 8) {
	    // Round the argument pointer up to the next multiple of Align.
	    assert(((Align & (Align - 1)) == 0) && "Expected Align to be a power of 2");
	    VAList = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
	                         DAG.getConstant(Align - 1, DL, PtrVT));
	    VAList = DAG.getNode(ISD::AND, DL, PtrVT, VAList,
	                         DAG.getConstant(-(int64_t)Align, DL, PtrVT));
	  }

	  // The IR type is handled through a pointer: getTypeForEVT needs the
	  // LLVMContext itself (DAG.getContext() hands back a pointer to it) and
	  // getTypeAllocSize expects a Type pointer.
	  Type *ArgTy = VT.getTypeForEVT(*DAG.getContext());
	  uint64_t ArgSize = DAG.getDataLayout().getTypeAllocSize(ArgTy);

	  // Scalar integer and FP values smaller than 64 bits are implicitly extended
	  // up to 64 bits. At the very least, we have to increase the striding of the
	  // vaargs list to match this, and for FP values we need to introduce
	  // FP_ROUND nodes as well.
	  if (VT.isInteger() && !VT.isVector())
	    ArgSize = 8;
	  bool NeedFPTrunc = false;
	  if (VT.isFloatingPoint() && !VT.isVector() && VT != MVT::f64) {
	    ArgSize = 8;
	    NeedFPTrunc = true;
	  }

	  // Increment the pointer, VAList, to the next vaarg
	  SDValue VANext = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
	                               DAG.getConstant(ArgSize, DL, PtrVT));
	  // Store the incremented VAList to the legalized pointer
	  SDValue APStore =
	      DAG.getStore(Chain, DL, VANext, Addr, MachinePointerInfo(V));

	  // Load the actual argument out of the pointer VAList
	  if (NeedFPTrunc) {
	    // Load the value as an f64.
	    SDValue WideFP =
	        DAG.getLoad(MVT::f64, DL, APStore, VAList, MachinePointerInfo());
	    // Round the f64 down to the requested scalar type VT.
	    SDValue NarrowFP = DAG.getNode(ISD::FP_ROUND, DL, VT, WideFP.getValue(0),
	                                   DAG.getIntPtrConstant(1, DL));
	    SDValue Ops[] = { NarrowFP, WideFP.getValue(1) };
	    // Merge the rounded value with the chain output of the load.
	    return DAG.getMergeValues(Ops, DL);
	  }

	  return DAG.getLoad(VT, DL, APStore, VAList, MachinePointerInfo());
	}

	/// Lower FRAMEADDR: start from the FP register and follow the chain of
	/// frame pointers Depth times, where Depth is constant operand 0. Also marks
	/// the function's frame address as taken.
	SDValue AArch64TargetLowering::LowerFRAMEADDR(SDValue Op,
	                                              SelectionDAG &DAG) const {
	  DAG.getMachineFunction().getFrameInfo().setFrameAddressIsTaken(true);

	  SDLoc Loc(Op);
	  EVT VT = Op.getValueType();
	  const unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();

	  SDValue Frame = DAG.getCopyFromReg(DAG.getEntryNode(), Loc, AArch64::FP, VT);
	  // Each load replaces the current frame address with the value stored at it.
	  for (unsigned Level = 0; Level != Depth; ++Level)
	    Frame = DAG.getLoad(VT, Loc, DAG.getEntryNode(), Frame,
	                        MachinePointerInfo());
	  return Frame;
	}

	/// Map a register name to its register number. Only "sp" is recognized; any
	/// other name is reported as a fatal error.
	// FIXME? Maybe this could be a TableGen attribute on some registers and
	// this table could be generated automatically from RegInfo.
	unsigned AArch64TargetLowering::getRegisterByName(const char* RegName, EVT VT,
	                                                  SelectionDAG &DAG) const {
	  const unsigned Found = StringSwitch<unsigned>(RegName)
	                             .Case("sp", AArch64::SP)
	                             .Default(0);
	  // 0 is the "no match" marker produced by the Default case above.
	  if (!Found)
	    report_fatal_error(Twine("Invalid register name \""
	                             + StringRef(RegName) + "\"."));
	  return Found;
	}

	/// Lower RETURNADDR. For depth 0 the value is read from LR, which is added as
	/// a function live-in. For a non-zero depth it is loaded from 8 bytes past
	/// the frame address computed by LowerFRAMEADDR. Also marks the function's
	/// return address as taken.
	SDValue AArch64TargetLowering::LowerRETURNADDR(SDValue Op,
	                                               SelectionDAG &DAG) const {
	  MachineFunction &MF = DAG.getMachineFunction();
	  MF.getFrameInfo().setReturnAddressIsTaken(true);

	  SDLoc Loc(Op);
	  EVT VT = Op.getValueType();
	  const unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();

	  if (Depth == 0) {
	    // Return LR, which contains the return address. Mark it an implicit
	    // live-in.
	    unsigned LinkReg = MF.addLiveIn(AArch64::LR, &AArch64::GPR64RegClass);
	    return DAG.getCopyFromReg(DAG.getEntryNode(), Loc, LinkReg, VT);
	  }

	  SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
	  SDValue SlotOffset =
	      DAG.getConstant(8, Loc, getPointerTy(DAG.getDataLayout()));
	  SDValue SlotAddr = DAG.getNode(ISD::ADD, Loc, VT, FrameAddr, SlotOffset);
	  return DAG.getLoad(VT, Loc, DAG.getEntryNode(), SlotAddr,
	                     MachinePointerInfo());
	}

	/// LowerShiftRightParts - Lower SRA_PARTS and SRL_PARTS, which return two
	/// i64 values and take a 2 x i64 value to shift plus a shift amount.
	///
	/// Operand 0 is the low half, operand 1 the high half and operand 2 the shift
	/// amount. The merged result is { Lo, Hi }.
	SDValue AArch64TargetLowering::LowerShiftRightParts(SDValue Op,
	SelectionDAG &DAG) const {
	assert(Op.getNumOperands() == 3 && "Not a double-shift!");
	EVT VT = Op.getValueType();
	unsigned VTBits = VT.getSizeInBits();
	SDLoc dl(Op);
	SDValue ShOpLo = Op.getOperand(0);
	SDValue ShOpHi = Op.getOperand(1);
	SDValue ShAmt = Op.getOperand(2);
	// The high half uses an arithmetic shift for SRA_PARTS, a logical one
	// otherwise.
	unsigned Opc = (Op.getOpcode() == ISD::SRA_PARTS) ? ISD::SRA : ISD::SRL;

	assert(Op.getOpcode() == ISD::SRA_PARTS \|\| Op.getOpcode() == ISD::SRL_PARTS);

	// Bits of the high half that move down into the low half.
	SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64,
	DAG.getConstant(VTBits, dl, MVT::i64), ShAmt);
	SDValue HiBitsForLo = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, RevShAmt);

	// Unfortunately, if ShAmt == 0, we just calculated "(SHL ShOpHi, 64)" which
	// is "undef". We wanted 0, so CSEL it directly.
	SDValue Cmp = emitComparison(ShAmt, DAG.getConstant(0, dl, MVT::i64),
	ISD::SETEQ, dl, DAG);
	SDValue CCVal = DAG.getConstant(AArch64CC::EQ, dl, MVT::i32);
	HiBitsForLo =
	DAG.getNode(AArch64ISD::CSEL, dl, VT, DAG.getConstant(0, dl, MVT::i64),
	HiBitsForLo, CCVal, Cmp);

	// ExtraShAmt is ShAmt - VTBits; it is non-negative exactly when the shift
	// amount is at least the width of one half (the "big shift" case).
	SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64, ShAmt,
	DAG.getConstant(VTBits, dl, MVT::i64));

	SDValue LoBitsForLo = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, ShAmt);
	SDValue LoForNormalShift =
	DAG.getNode(ISD::OR, dl, VT, LoBitsForLo, HiBitsForLo);

	// From here on Cmp/CCVal describe "ExtraShAmt >= 0" and are shared by the
	// selects for both Lo and Hi.
	Cmp = emitComparison(ExtraShAmt, DAG.getConstant(0, dl, MVT::i64), ISD::SETGE,
	dl, DAG);
	CCVal = DAG.getConstant(AArch64CC::GE, dl, MVT::i32);
	SDValue LoForBigShift = DAG.getNode(Opc, dl, VT, ShOpHi, ExtraShAmt);
	SDValue Lo = DAG.getNode(AArch64ISD::CSEL, dl, VT, LoForBigShift,
	LoForNormalShift, CCVal, Cmp);

	// AArch64 shifts larger than the register width are wrapped rather than
	// clamped, so we can't just emit "hi >> x".
	SDValue HiForNormalShift = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt);
	// For a big shift the high half is either all copies of the sign bit (SRA)
	// or zero (SRL).
	SDValue HiForBigShift =
	Opc == ISD::SRA
	? DAG.getNode(Opc, dl, VT, ShOpHi,
	DAG.getConstant(VTBits - 1, dl, MVT::i64))
	: DAG.getConstant(0, dl, VT);
	SDValue Hi = DAG.getNode(AArch64ISD::CSEL, dl, VT, HiForBigShift,
	HiForNormalShift, CCVal, Cmp);

	SDValue Ops[2] = { Lo, Hi };
	return DAG.getMergeValues(Ops, dl);
	}

	/// LowerShiftLeftParts - Lower SHL_PARTS, which returns two
	/// i64 values and take a 2 x i64 value to shift plus a shift amount.
	///
	/// Operand 0 is the low half, operand 1 the high half and operand 2 the shift
	/// amount. The merged result is { Lo, Hi }.
	SDValue AArch64TargetLowering::LowerShiftLeftParts(SDValue Op,
	SelectionDAG &DAG) const {
	assert(Op.getNumOperands() == 3 && "Not a double-shift!");
	EVT VT = Op.getValueType();
	unsigned VTBits = VT.getSizeInBits();
	SDLoc dl(Op);
	SDValue ShOpLo = Op.getOperand(0);
	SDValue ShOpHi = Op.getOperand(1);
	SDValue ShAmt = Op.getOperand(2);

	assert(Op.getOpcode() == ISD::SHL_PARTS);
	// Bits of the low half that move up into the high half.
	SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64,
	DAG.getConstant(VTBits, dl, MVT::i64), ShAmt);
	SDValue LoBitsForHi = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, RevShAmt);

	// Unfortunately, if ShAmt == 0, we just calculated "(SRL ShOpLo, 64)" which
	// is "undef". We wanted 0, so CSEL it directly.
	SDValue Cmp = emitComparison(ShAmt, DAG.getConstant(0, dl, MVT::i64),
	ISD::SETEQ, dl, DAG);
	SDValue CCVal = DAG.getConstant(AArch64CC::EQ, dl, MVT::i32);
	LoBitsForHi =
	DAG.getNode(AArch64ISD::CSEL, dl, VT, DAG.getConstant(0, dl, MVT::i64),
	LoBitsForHi, CCVal, Cmp);

	// ExtraShAmt is ShAmt - VTBits; it is non-negative exactly when the shift
	// amount is at least the width of one half (the "big shift" case).
	SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64, ShAmt,
	DAG.getConstant(VTBits, dl, MVT::i64));
	SDValue HiBitsForHi = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, ShAmt);
	SDValue HiForNormalShift =
	DAG.getNode(ISD::OR, dl, VT, LoBitsForHi, HiBitsForHi);

	SDValue HiForBigShift = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ExtraShAmt);

	// From here on Cmp/CCVal describe "ExtraShAmt >= 0" and are shared by the
	// selects for both Hi and Lo.
	Cmp = emitComparison(ExtraShAmt, DAG.getConstant(0, dl, MVT::i64), ISD::SETGE,
	dl, DAG);
	CCVal = DAG.getConstant(AArch64CC::GE, dl, MVT::i32);
	SDValue Hi = DAG.getNode(AArch64ISD::CSEL, dl, VT, HiForBigShift,
	HiForNormalShift, CCVal, Cmp);

	// AArch64 shifts of larger than register sizes are wrapped rather than
	// clamped, so we can't just emit "lo << a" if a is too big.
	SDValue LoForBigShift = DAG.getConstant(0, dl, VT);
	SDValue LoForNormalShift = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
	SDValue Lo = DAG.getNode(AArch64ISD::CSEL, dl, VT, LoForBigShift,
	LoForNormalShift, CCVal, Cmp);

	SDValue Ops[2] = { Lo, Hi };
	return DAG.getMergeValues(Ops, dl);
	}

	/// Report whether an offset may be folded into the given global address
	/// node. Always false here; the GA argument is not inspected.
	bool AArch64TargetLowering::isOffsetFoldingLegal(
	const GlobalAddressSDNode *GA) const {
	// The AArch64 target doesn't support folding offsets into global addresses.
	return false;
	}

	/// Report whether the floating-point constant Imm of type VT is accepted as
	/// an immediate. Only f64 and f32 are considered: +0.0 is always accepted,
	/// any other value is accepted when the matching getFP64Imm/getFP32Imm
	/// query does not return -1.
	bool AArch64TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
	  const bool IsDouble = VT == MVT::f64;
	  const bool IsSingle = VT == MVT::f32;
	  if (!IsDouble && !IsSingle)
	    return false;

	  // We can materialize #0.0 as fmov $Rd, XZR for 64-bit and 32-bit cases.
	  // FIXME: We should be able to handle f128 as well with a clever lowering.
	  if (Imm.isPosZero())
	    return true;

	  if (IsDouble)
	    return AArch64_AM::getFP64Imm(Imm) != -1;
	  return AArch64_AM::getFP32Imm(Imm) != -1;
	}

	//===----------------------------------------------------------------------===//
	// AArch64 Optimization Hooks
	//===----------------------------------------------------------------------===//

	/// Emit the estimate node Opcode for Operand when the subtarget has NEON and
	/// the operand type is one of the supported scalar or vector f32/f64 types;
	/// otherwise return an empty SDValue.
	///
	/// If ExtraSteps arrives as ReciprocalEstimate::Unspecified it is replaced by
	/// the number of refinement steps needed for the element type: 3 for
	/// double-precision elements, 2 for single-precision ones. It is left
	/// untouched when no estimate is emitted.
	static SDValue getEstimate(const AArch64Subtarget *ST, unsigned Opcode,
	                           SDValue Operand, SelectionDAG &DAG,
	                           int &ExtraSteps) {
	  EVT VT = Operand.getValueType();
	  if (ST->hasNEON() &&
	      (VT == MVT::f64 || VT == MVT::v1f64 || VT == MVT::v2f64 ||
	       VT == MVT::f32 || VT == MVT::v1f32 ||
	       VT == MVT::v2f32 || VT == MVT::v4f32)) {
	    if (ExtraSteps == TargetLoweringBase::ReciprocalEstimate::Unspecified)
	      // For the reciprocal estimates, convergence is quadratic, so the number
	      // of digits is doubled after each iteration. In ARMv8, the accuracy of
	      // the initial estimate is 2^-8. Thus the number of extra steps to refine
	      // the result for float (23 mantissa bits) is 2 and for double (52
	      // mantissa bits) is 3.
	      // Decide on the element type so that the v1f64/v2f64 vectors accepted
	      // above get the double-precision step count as well.
	      ExtraSteps = VT.getScalarType() == MVT::f64 ? 3 : 2;

	    return DAG.getNode(Opcode, SDLoc(Operand), VT, Operand);
	  }

	  return SDValue();
	}

	/// Build a square-root (Reciprocal == false) or reciprocal-square-root
	/// (Reciprocal == true) estimate of Operand from FRSQRTE refined by ExtraSteps
	/// FRSQRTS iterations.
	///
	/// The estimate is only attempted when Enabled is ReciprocalEstimate::Enabled,
	/// or when it is Unspecified and the subtarget reports useRSqrt(). If it is
	/// not attempted, or getEstimate yields nothing, an empty SDValue is returned.
	/// On success ExtraSteps is reset to 0 because the refinement has already been
	/// emitted here. UseOneConst is not written by this function.
	SDValue AArch64TargetLowering::getSqrtEstimate(SDValue Operand,
	SelectionDAG &DAG, int Enabled,
	int &ExtraSteps,
	bool &UseOneConst,
	bool Reciprocal) const {
	if (Enabled == ReciprocalEstimate::Enabled \|\|
	(Enabled == ReciprocalEstimate::Unspecified && Subtarget->useRSqrt()))
	if (SDValue Estimate = getEstimate(Subtarget, AArch64ISD::FRSQRTE, Operand,
	DAG, ExtraSteps)) {
	SDLoc DL(Operand);
	EVT VT = Operand.getValueType();

	// All refinement arithmetic below is tagged with the unsafe-algebra flag.
	SDNodeFlags Flags;
	Flags.setUnsafeAlgebra(true);

	// Newton reciprocal square root iteration: E * 0.5 * (3 - X * E^2)
	// AArch64 reciprocal square root iteration instruction: 0.5 * (3 - M * N)
	for (int i = ExtraSteps; i > 0; --i) {
	SDValue Step = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Estimate,
	&Flags);
	Step = DAG.getNode(AArch64ISD::FRSQRTS, DL, VT, Operand, Step, &Flags);
	Estimate = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Step, &Flags);
	}

	if (!Reciprocal) {
	// Turn the reciprocal square root into a square root by multiplying it
	// with the operand.
	EVT CCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(),
	VT);
	SDValue FPZero = DAG.getConstantFP(0.0, DL, VT);
	SDValue Eq = DAG.getSetCC(DL, CCVT, Operand, FPZero, ISD::SETEQ);

	Estimate = DAG.getNode(ISD::FMUL, DL, VT, Operand, Estimate, &Flags);
	// Correct the result if the operand is 0.0.
	Estimate = DAG.getNode(VT.isVector() ? ISD::VSELECT : ISD::SELECT, DL,
	VT, Eq, Operand, Estimate);
	}

	ExtraSteps = 0;
	return Estimate;
	}

	return SDValue();
	}

	/// Build a reciprocal estimate of Operand from FRECPE refined by ExtraSteps
	/// FRECPS iterations.
	///
	/// The estimate is only attempted when Enabled is ReciprocalEstimate::Enabled.
	/// If it is not attempted, or getEstimate yields nothing, an empty SDValue is
	/// returned. On success ExtraSteps is reset to 0 because the refinement has
	/// already been emitted here.
	SDValue AArch64TargetLowering::getRecipEstimate(SDValue Operand,
	                                                SelectionDAG &DAG, int Enabled,
	                                                int &ExtraSteps) const {
	  if (Enabled != ReciprocalEstimate::Enabled)
	    return SDValue();

	  SDValue Approx =
	      getEstimate(Subtarget, AArch64ISD::FRECPE, Operand, DAG, ExtraSteps);
	  if (!Approx)
	    return SDValue();

	  SDLoc Loc(Operand);
	  EVT VT = Operand.getValueType();

	  // All refinement arithmetic below is tagged with the unsafe-algebra flag.
	  SDNodeFlags Flags;
	  Flags.setUnsafeAlgebra(true);

	  // Newton reciprocal iteration: E * (2 - X * E)
	  // AArch64 reciprocal iteration instruction: (2 - M * N)
	  int Remaining = ExtraSteps;
	  while (Remaining > 0) {
	    SDValue Correction =
	        DAG.getNode(AArch64ISD::FRECPS, Loc, VT, Operand, Approx, &Flags);
	    Approx = DAG.getNode(ISD::FMUL, Loc, VT, Approx, Correction, &Flags);
	    --Remaining;
	  }

	  ExtraSteps = 0;
	  return Approx;
	}

	//===----------------------------------------------------------------------===//
	// AArch64 Inline Assembly Support
	//===----------------------------------------------------------------------===//

	// Table of Constraints
	// TODO: This is the current set of constraints supported by ARM for the
	// compiler, not all of them may make sense, e.g. S may be difficult to support.
	//
	// r - A general register
	// w - An FP/SIMD register of some size in the range v0-v31
	// x - An FP/SIMD register of some size in the range v0-v15
	// I - Constant that can be used with an ADD instruction
	// J - Constant that can be used with a SUB instruction
	// K - Constant that can be used with a 32-bit logical instruction
	// L - Constant that can be used with a 64-bit logical instruction
	// M - Constant that can be used as a 32-bit MOV immediate
	// N - Constant that can be used as a 64-bit MOV immediate
	// Q - A memory reference with base register and no offset
	// S - A symbolic address
	// Y - Floating point constant zero
	// Z - Integer constant zero
	//
	// Note that general register operands will be output using their 64-bit x
	// register name, whatever the size of the variable, unless the asm operand
	// is prefixed by the %w modifier. Floating-point and SIMD register operands
	// will be output with the v prefix unless prefixed by the %b, %h, %s, %d or
	// %q modifier.
	/// Pick the concrete constraint that replaces an "X" constraint for a value
	/// of type ConstraintVT: "w" for floating-point values and for 64-bit or
	/// 128-bit vectors when the subtarget has FPARMv8, "r" in every other case.
	const char *AArch64TargetLowering::LowerXConstraint(EVT ConstraintVT) const {
	  // At this point, we have to lower this constraint to something else, so we
	  // lower it to an "r" or "w". However, by doing this we will force the result
	  // to be in register, while the X constraint is much more permissive.
	  //
	  // Although we are correct (we are free to emit anything, without
	  // constraints), we might break use cases that would expect us to be more
	  // efficient and emit something else.
	  if (!Subtarget->hasFPARMv8())
	    return "r";

	  if (ConstraintVT.isFloatingPoint())
	    return "w";

	  if (ConstraintVT.isVector()) {
	    const auto Bits = ConstraintVT.getSizeInBits();
	    if (Bits == 64 || Bits == 128)
	      return "w";
	  }

	  return "r";
	}

	/// getConstraintType - Classify an inline-asm constraint for this target.
	/// The single letters 'z', 'x', 'w' and 'Q' are handled here; everything
	/// else is deferred to the generic TargetLowering implementation.
	AArch64TargetLowering::ConstraintType
	AArch64TargetLowering::getConstraintType(StringRef Constraint) const {
	  if (Constraint.size() != 1)
	    return TargetLowering::getConstraintType(Constraint);

	  switch (Constraint[0]) {
	  case 'z':
	    return C_Other;
	  case 'w':
	  case 'x':
	    return C_RegisterClass;
	  case 'Q':
	    // An address with a single base register. Due to the way we
	    // currently handle addresses it is the same as 'r'.
	    return C_Memory;
	  default:
	    return TargetLowering::getConstraintType(Constraint);
	  }
	}

	/// Weigh how well the operand described by info matches the constraint whose
	/// first letter is *constraint.
	/// This object must already have been set up with the operand type
	/// and the current alternative constraint selected.
	///
	/// 'x'/'w' score CW_Register for floating-point or vector operands and
	/// CW_Invalid otherwise, 'z' scores CW_Constant, and every other letter is
	/// deferred to the generic TargetLowering implementation.
	TargetLowering::ConstraintWeight
	AArch64TargetLowering::getSingleConstraintMatchWeight(
	    AsmOperandInfo &info, const char *constraint) const {
	  Value *OperandVal = info.CallOperandVal;
	  // If we don't have a value, we can't do a match,
	  // but allow it at the lowest weight.
	  if (!OperandVal)
	    return CW_Default;

	  Type *OperandTy = OperandVal->getType();
	  // Look at the constraint type.
	  switch (*constraint) {
	  case 'w':
	  case 'x': {
	    const bool FitsFPReg =
	        OperandTy->isFloatingPointTy() || OperandTy->isVectorTy();
	    return FitsFPReg ? CW_Register : CW_Invalid;
	  }
	  case 'z':
	    return CW_Constant;
	  default:
	    return TargetLowering::getSingleConstraintMatchWeight(info, constraint);
	  }
	}

	/// Resolve an inline-asm register constraint to a (register, register class)
	/// pair for a value of type VT. A register number of 0 in the returned pair
	/// is what this function uses when only the class is chosen.
	///
	/// Resolution order: the single letters 'r', 'w' and 'x' pick a class by the
	/// bit width of VT; "{cc}" (case-insensitive) maps to NZCV; then the generic
	/// TargetLowering lookup is tried; finally "{vN}" with N in 0..31 is mapped
	/// onto the FPR64 or FPR128 class.
	std::pair<unsigned, const TargetRegisterClass *>
	AArch64TargetLowering::getRegForInlineAsmConstraint(
	const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const {
	if (Constraint.size() == 1) {
	switch (Constraint[0]) {
	case 'r':
	// Anything that is not exactly 64 bits wide gets the 32-bit class.
	if (VT.getSizeInBits() == 64)
	return std::make_pair(0U, &AArch64::GPR64commonRegClass);
	return std::make_pair(0U, &AArch64::GPR32commonRegClass);
	case 'w':
	if (VT.getSizeInBits() == 16)
	return std::make_pair(0U, &AArch64::FPR16RegClass);
	if (VT.getSizeInBits() == 32)
	return std::make_pair(0U, &AArch64::FPR32RegClass);
	if (VT.getSizeInBits() == 64)
	return std::make_pair(0U, &AArch64::FPR64RegClass);
	if (VT.getSizeInBits() == 128)
	return std::make_pair(0U, &AArch64::FPR128RegClass);
	// Other widths fall out of the switch to the generic handling below.
	break;
	// The instructions that this constraint is designed for can
	// only take 128-bit registers so just use that regclass.
	case 'x':
	if (VT.getSizeInBits() == 128)
	return std::make_pair(0U, &AArch64::FPR128_loRegClass);
	break;
	}
	}
	if (StringRef("{cc}").equals_lower(Constraint))
	return std::make_pair(unsigned(AArch64::NZCV), &AArch64::CCRRegClass);

	// Use the default implementation in TargetLowering to convert the register
	// constraint into a member of a register class.
	std::pair<unsigned, const TargetRegisterClass *> Res;
	Res = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);

	// Not found as a standard register?
	if (!Res.second) {
	// Accept "{vN}" / "{vNN}" (the 'v' may be upper or lower case): the total
	// length is 4 or 5 and the characters between "{v" and "}" are parsed as
	// a decimal register number.
	unsigned Size = Constraint.size();
	if ((Size == 4 \|\| Size == 5) && Constraint[0] == '{' &&
	tolower(Constraint[1]) == 'v' && Constraint[Size - 1] == '}') {
	int RegNo;
	bool Failed = Constraint.slice(2, Size - 1).getAsInteger(10, RegNo);
	if (!Failed && RegNo >= 0 && RegNo <= 31) {
	// v0 - v31 are aliases of q0 - q31 or d0 - d31 depending on size.
	// By default we'll emit v0-v31 for this unless there's a modifier where
	// we'll emit the correct register as well.
	if (VT != MVT::Other && VT.getSizeInBits() == 64) {
	Res.first = AArch64::FPR64RegClass.getRegister(RegNo);
	Res.second = &AArch64::FPR64RegClass;
	} else {
	Res.first = AArch64::FPR128RegClass.getRegister(RegNo);
	Res.second = &AArch64::FPR128RegClass;
	}
	}
	}
	}

	return Res;
	}

	/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
	/// vector. If it is invalid, don't add anything to Ops.
	///
	/// Only single-character constraints are examined here:
	///  - 'z' accepts a null constant and produces the XZR or WZR register.
	///  - 'I', 'J', 'K', 'L', 'M' and 'N' accept a ConstantSDNode whose value
	///    passes the per-letter immediate check and produce an i64 target
	///    constant.
	/// Any other single-character constraint is forwarded to the generic
	/// TargetLowering implementation. Multi-character constraints, and operands
	/// that fail validation, leave Ops untouched.
	void AArch64TargetLowering::LowerAsmOperandForConstraint(
	    SDValue Op, std::string &Constraint, std::vector<SDValue> &Ops,
	    SelectionDAG &DAG) const {
	  SDValue Result;

	  // Currently only support length 1 constraints.
	  if (Constraint.length() != 1)
	    return;

	  char ConstraintLetter = Constraint[0];
	  switch (ConstraintLetter) {
	  default:
	    // Not one of ours: Result stays empty and the generic code below runs.
	    break;

	  // This set of constraints deal with valid constants for various instructions.
	  // Validate and return a target constant for them if we can.
	  case 'z': {
	    // 'z' maps to xzr or wzr so it needs an input of 0.
	    if (!isNullConstant(Op))
	      return;

	    if (Op.getValueType() == MVT::i64)
	      Result = DAG.getRegister(AArch64::XZR, MVT::i64);
	    else
	      Result = DAG.getRegister(AArch64::WZR, MVT::i32);
	    break;
	  }

	  case 'I':
	  case 'J':
	  case 'K':
	  case 'L':
	  case 'M':
	  case 'N':
	    ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op);
	    if (!C)
	      return;

	    // Grab the value and do some validation.
	    uint64_t CVal = C->getZExtValue();
	    switch (ConstraintLetter) {
	    // The I constraint applies only to simple ADD or SUB immediate operands:
	    // i.e. 0 to 4095 with optional shift by 12
	    // The J constraint applies only to ADD or SUB immediates that would be
	    // valid when negated, i.e. if [an add pattern] were to be output as a SUB
	    // instruction [or vice versa], in other words -1 to -4095 with optional
	    // left shift by 12.
	    case 'I':
	      if (isUInt<12>(CVal) || isShiftedUInt<12, 12>(CVal))
	        break;
	      return;
	    case 'J': {
	      uint64_t NVal = -C->getSExtValue();
	      if (isUInt<12>(NVal) || isShiftedUInt<12, 12>(NVal)) {
	        // Emit the original (negative) value, sign-extended to 64 bits.
	        CVal = C->getSExtValue();
	        break;
	      }
	      return;
	    }
	    // The K and L constraints apply only to logical immediates, including
	    // what used to be the MOVI alias for ORR (though the MOVI alias has now
	    // been removed and MOV should be used). So these constraints have to
	    // distinguish between bit patterns that are valid 32-bit or 64-bit
	    // "bitmask immediates": for example 0xaaaaaaaa is a valid bimm32 (K), but
	    // not a valid bimm64 (L) where 0xaaaaaaaaaaaaaaaa would be valid, and vice
	    // versa.
	    case 'K':
	      if (AArch64_AM::isLogicalImmediate(CVal, 32))
	        break;
	      return;
	    case 'L':
	      if (AArch64_AM::isLogicalImmediate(CVal, 64))
	        break;
	      return;
	    // The M and N constraints are a superset of K and L respectively, for use
	    // with the MOV (immediate) alias. As well as the logical immediates they
	    // also match 32 or 64-bit immediates that can be loaded either using a
	    // single MOVZ or MOVN , such as 32-bit 0x12340000, 0x00001234, 0xffffedca
	    // (M) or 64-bit 0x1234000000000000 (N) etc.
	    // As a note some of this code is liberally stolen from the asm parser.
	    case 'M': {
	      if (!isUInt<32>(CVal))
	        return;
	      if (AArch64_AM::isLogicalImmediate(CVal, 32))
	        break;
	      // Single MOVZ: only one 16-bit chunk is populated.
	      if ((CVal & 0xFFFF) == CVal)
	        break;
	      if ((CVal & 0xFFFF0000ULL) == CVal)
	        break;
	      // Single MOVN: the 32-bit complement has only one 16-bit chunk set.
	      uint64_t NCVal = ~(uint32_t)CVal;
	      if ((NCVal & 0xFFFFULL) == NCVal)
	        break;
	      if ((NCVal & 0xFFFF0000ULL) == NCVal)
	        break;
	      return;
	    }
	    case 'N': {
	      if (AArch64_AM::isLogicalImmediate(CVal, 64))
	        break;
	      // Single MOVZ: only one 16-bit chunk is populated.
	      if ((CVal & 0xFFFFULL) == CVal)
	        break;
	      if ((CVal & 0xFFFF0000ULL) == CVal)
	        break;
	      if ((CVal & 0xFFFF00000000ULL) == CVal)
	        break;
	      if ((CVal & 0xFFFF000000000000ULL) == CVal)
	        break;
	      // Single MOVN: the 64-bit complement has only one 16-bit chunk set.
	      uint64_t NCVal = ~CVal;
	      if ((NCVal & 0xFFFFULL) == NCVal)
	        break;
	      if ((NCVal & 0xFFFF0000ULL) == NCVal)
	        break;
	      if ((NCVal & 0xFFFF00000000ULL) == NCVal)
	        break;
	      if ((NCVal & 0xFFFF000000000000ULL) == NCVal)
	        break;
	      return;
	    }
	    default:
	      return;
	    }

	    // All assembler immediates are 64-bit integers.
	    Result = DAG.getTargetConstant(CVal, SDLoc(Op), MVT::i64);
	    break;
	  }

	  if (Result.getNode()) {
	    Ops.push_back(Result);
	    return;
	  }

	  return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
	}

	//===----------------------------------------------------------------------===//
	// AArch64 Advanced SIMD Support
	//===----------------------------------------------------------------------===//

	/// WidenVector - Given a value in the V64 register class, produce the
	/// equivalent value in the V128 register class. The input is inserted at
	/// element 0 of an undef vector with twice as many elements of the same
	/// element type.
	static SDValue WidenVector(SDValue V64Reg, SelectionDAG &DAG) {
	  SDLoc Loc(V64Reg);
	  EVT NarrowVT = V64Reg.getValueType();
	  MVT ElemVT = NarrowVT.getVectorElementType().getSimpleVT();
	  unsigned WideCount = 2 * NarrowVT.getVectorNumElements();
	  MVT WideVT = MVT::getVectorVT(ElemVT, WideCount);

	  SDValue Undef = DAG.getUNDEF(WideVT);
	  SDValue ZeroIdx = DAG.getConstant(0, Loc, MVT::i32);
	  return DAG.getNode(ISD::INSERT_SUBVECTOR, Loc, WideVT, Undef, V64Reg,
	                     ZeroIdx);
	}

	/// getExtFactor - Determine the adjustment factor for the position when
	/// generating an "extract from vector registers" instruction. The factor is
	/// the size of one element of V in bytes.
	static unsigned getExtFactor(SDValue &V) {
	  unsigned EltBits = V.getValueType().getVectorElementType().getSizeInBits();
	  return EltBits / 8;
	}

	/// NarrowVector - Given a value in the V128 register class, produce the
	/// equivalent value in the V64 register class. The result has half as many
	/// elements of the same element type and is taken through the dsub
	/// subregister.
	static SDValue NarrowVector(SDValue V128Reg, SelectionDAG &DAG) {
	  SDLoc Loc(V128Reg);
	  EVT WideVT = V128Reg.getValueType();
	  unsigned HalfCount = WideVT.getVectorNumElements() / 2;
	  MVT HalfVT =
	      MVT::getVectorVT(WideVT.getVectorElementType().getSimpleVT(), HalfCount);

	  return DAG.getTargetExtractSubreg(AArch64::dsub, Loc, HalfVT, V128Reg);
	}

	// Gather data to see if the operation can be modelled as a
	// shuffle in combination with VEXTs.
	//
	// Op must be a BUILD_VECTOR. Every non-undef operand has to be an
	// EXTRACT_VECTOR_ELT with a constant index, drawn from at most two distinct
	// source vectors. On success the result is a VECTOR_SHUFFLE of the (possibly
	// padded, narrowed or bitcast) sources, bitcast back to Op's type. An empty
	// SDValue is returned whenever the pattern does not fit.
	SDValue AArch64TargetLowering::ReconstructShuffle(SDValue Op,
	                                                  SelectionDAG &DAG) const {
	  assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
	  SDLoc dl(Op);
	  EVT VT = Op.getValueType();
	  unsigned NumElts = VT.getVectorNumElements();

	  struct ShuffleSourceInfo {
	    SDValue Vec;
	    unsigned MinElt;
	    unsigned MaxElt;

	    // We may insert some combination of BITCASTs and VEXT nodes to force Vec to
	    // be compatible with the shuffle we intend to construct. As a result
	    // ShuffleVec will be some sliding window into the original Vec.
	    SDValue ShuffleVec;

	    // Code should guarantee that element i in Vec starts at element "WindowBase
	    // + i * WindowScale in ShuffleVec".
	    int WindowBase;
	    int WindowScale;

	    ShuffleSourceInfo(SDValue Vec)
	        : Vec(Vec), MinElt(std::numeric_limits<unsigned>::max()), MaxElt(0),
	          ShuffleVec(Vec), WindowBase(0), WindowScale(1) {}

	    // Lets find() locate an entry by its original source vector.
	    bool operator ==(SDValue OtherVec) { return Vec == OtherVec; }
	  };

	  // First gather all vectors used as an immediate source for this BUILD_VECTOR
	  // node.
	  SmallVector<ShuffleSourceInfo, 2> Sources;
	  for (unsigned i = 0; i < NumElts; ++i) {
	    SDValue V = Op.getOperand(i);
	    if (V.isUndef())
	      continue;
	    else if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
	             !isa<ConstantSDNode>(V.getOperand(1))) {
	      // A shuffle can only come from building a vector from various
	      // elements of other vectors, provided their indices are constant.
	      return SDValue();
	    }

	    // Add this element source to the list if it's not already there.
	    SDValue SourceVec = V.getOperand(0);
	    auto Source = find(Sources, SourceVec);
	    if (Source == Sources.end())
	      Source = Sources.insert(Sources.end(), ShuffleSourceInfo(SourceVec));

	    // Update the minimum and maximum lane number seen.
	    unsigned EltNo = cast<ConstantSDNode>(V.getOperand(1))->getZExtValue();
	    Source->MinElt = std::min(Source->MinElt, EltNo);
	    Source->MaxElt = std::max(Source->MaxElt, EltNo);
	  }

	  // Currently only do something sane when at most two source vectors
	  // are involved.
	  if (Sources.size() > 2)
	    return SDValue();

	  // Find out the smallest element size among result and two sources, and use
	  // it as element size to build the shuffle_vector.
	  EVT SmallestEltTy = VT.getVectorElementType();
	  for (auto &Source : Sources) {
	    EVT SrcEltTy = Source.Vec.getValueType().getVectorElementType();
	    if (SrcEltTy.bitsLT(SmallestEltTy)) {
	      SmallestEltTy = SrcEltTy;
	    }
	  }
	  // Number of shuffle lanes that make up one element of the result.
	  unsigned ResMultiplier =
	      VT.getScalarSizeInBits() / SmallestEltTy.getSizeInBits();
	  // From here on NumElts counts lanes of the shuffle type, not of VT.
	  NumElts = VT.getSizeInBits() / SmallestEltTy.getSizeInBits();
	  EVT ShuffleVT = EVT::getVectorVT(*DAG.getContext(), SmallestEltTy, NumElts);

	  // If the source vector is too wide or too narrow, we may nevertheless be able
	  // to construct a compatible shuffle either by concatenating it with UNDEF or
	  // extracting a suitable range of elements.
	  for (auto &Src : Sources) {
	    EVT SrcVT = Src.ShuffleVec.getValueType();

	    if (SrcVT.getSizeInBits() == VT.getSizeInBits())
	      continue;

	    // This stage of the search produces a source with the same element type as
	    // the original, but with a total width matching the BUILD_VECTOR output.
	    EVT EltVT = SrcVT.getVectorElementType();
	    unsigned NumSrcElts = VT.getSizeInBits() / EltVT.getSizeInBits();
	    EVT DestVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumSrcElts);

	    if (SrcVT.getSizeInBits() < VT.getSizeInBits()) {
	      assert(2 * SrcVT.getSizeInBits() == VT.getSizeInBits());
	      // We can pad out the smaller vector for free, so if it's part of a
	      // shuffle...
	      Src.ShuffleVec =
	          DAG.getNode(ISD::CONCAT_VECTORS, dl, DestVT, Src.ShuffleVec,
	                      DAG.getUNDEF(Src.ShuffleVec.getValueType()));
	      continue;
	    }

	    assert(SrcVT.getSizeInBits() == 2 * VT.getSizeInBits());

	    if (Src.MaxElt - Src.MinElt >= NumSrcElts) {
	      // Span too large for a VEXT to cope
	      return SDValue();
	    }

	    if (Src.MinElt >= NumSrcElts) {
	      // The extraction can just take the second half
	      Src.ShuffleVec =
	          DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
	                      DAG.getConstant(NumSrcElts, dl, MVT::i64));
	      Src.WindowBase = -NumSrcElts;
	    } else if (Src.MaxElt < NumSrcElts) {
	      // The extraction can just take the first half
	      Src.ShuffleVec =
	          DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
	                      DAG.getConstant(0, dl, MVT::i64));
	    } else {
	      // An actual VEXT is needed
	      SDValue VEXTSrc1 =
	          DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
	                      DAG.getConstant(0, dl, MVT::i64));
	      SDValue VEXTSrc2 =
	          DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
	                      DAG.getConstant(NumSrcElts, dl, MVT::i64));
	      // The EXT immediate is expressed in bytes.
	      unsigned Imm = Src.MinElt * getExtFactor(VEXTSrc1);

	      Src.ShuffleVec = DAG.getNode(AArch64ISD::EXT, dl, DestVT, VEXTSrc1,
	                                   VEXTSrc2,
	                                   DAG.getConstant(Imm, dl, MVT::i32));
	      Src.WindowBase = -Src.MinElt;
	    }
	  }

	  // Another possible incompatibility occurs from the vector element types. We
	  // can fix this by bitcasting the source vectors to the same type we intend
	  // for the shuffle.
	  for (auto &Src : Sources) {
	    EVT SrcEltTy = Src.ShuffleVec.getValueType().getVectorElementType();
	    if (SrcEltTy == SmallestEltTy)
	      continue;
	    assert(ShuffleVT.getVectorElementType() == SmallestEltTy);
	    Src.ShuffleVec = DAG.getNode(ISD::BITCAST, dl, ShuffleVT, Src.ShuffleVec);
	    Src.WindowScale = SrcEltTy.getSizeInBits() / SmallestEltTy.getSizeInBits();
	    Src.WindowBase *= Src.WindowScale;
	  }

	  // Final sanity check before we try to actually produce a shuffle.
	  DEBUG(
	    for (auto Src : Sources)
	      assert(Src.ShuffleVec.getValueType() == ShuffleVT);
	  );

	  // The stars all align, our next step is to produce the mask for the shuffle.
	  SmallVector<int, 8> Mask(ShuffleVT.getVectorNumElements(), -1);
	  int BitsPerShuffleLane = ShuffleVT.getScalarSizeInBits();
	  for (unsigned i = 0; i < VT.getVectorNumElements(); ++i) {
	    SDValue Entry = Op.getOperand(i);
	    if (Entry.isUndef())
	      continue;

	    auto Src = find(Sources, Entry.getOperand(0));
	    int EltNo = cast<ConstantSDNode>(Entry.getOperand(1))->getSExtValue();

	    // EXTRACT_VECTOR_ELT performs an implicit any_ext; BUILD_VECTOR an implicit
	    // trunc. So only std::min(SrcBits, DestBits) actually get defined in this
	    // segment.
	    EVT OrigEltTy = Entry.getOperand(0).getValueType().getVectorElementType();
	    int BitsDefined =
	        std::min(OrigEltTy.getSizeInBits(), VT.getScalarSizeInBits());
	    int LanesDefined = BitsDefined / BitsPerShuffleLane;

	    // This source is expected to fill ResMultiplier lanes of the final shuffle,
	    // starting at the appropriate offset.
	    int *LaneMask = &Mask[i * ResMultiplier];

	    int ExtractBase = EltNo * Src->WindowScale + Src->WindowBase;
	    // Lanes of the second source are numbered after those of the first.
	    ExtractBase += NumElts * (Src - Sources.begin());
	    for (int j = 0; j < LanesDefined; ++j)
	      LaneMask[j] = ExtractBase + j;
	  }

	  // Final check before we try to produce nonsense...
	  if (!isShuffleMaskLegal(Mask, ShuffleVT))
	    return SDValue();

	  SDValue ShuffleOps[] = { DAG.getUNDEF(ShuffleVT), DAG.getUNDEF(ShuffleVT) };
	  for (unsigned i = 0; i < Sources.size(); ++i)
	    ShuffleOps[i] = Sources[i].ShuffleVec;

	  SDValue Shuffle = DAG.getVectorShuffle(ShuffleVT, dl, ShuffleOps[0],
	                                         ShuffleOps[1], Mask);
	  return DAG.getNode(ISD::BITCAST, dl, VT, Shuffle);
	}

	// Check whether an EXT instruction can implement the shuffle mask when both
	// vector sources of the shuffle are the same. On entry past the first check,
	// Imm receives the first mask index, which is the rotation amount in
	// elements. The mask matches when every later defined index continues the
	// sequence from there, wrapping back to zero at the element count.
	static bool isSingletonEXTMask(ArrayRef<int> M, EVT VT, unsigned &Imm) {
	  unsigned Count = VT.getVectorNumElements();

	  // An undefined leading index gives us nothing to anchor the rotation on.
	  int First = M[0];
	  if (First < 0)
	    return false;

	  Imm = First;

	  unsigned Want = Imm;
	  for (unsigned Pos = 1; Pos < Count; ++Pos) {
	    // Advance to the next expected index, wrapping at the vector length.
	    Want = (Want + 1 == Count) ? 0 : Want + 1;

	    int Actual = M[Pos];
	    // Undefined entries match anything.
	    if (Actual >= 0 && static_cast<unsigned>(Actual) != Want)
	      return false;
	  }

	  return true;
	}

	// Check if an EXT instruction can handle the shuffle mask when the
	// vector sources of the shuffle are different.
	//
	// On success Imm holds the EXT start index in elements (already reduced into
	// the range of a single source vector). ReverseEXT is only ever set to true
	// here; it is left untouched when no swap is needed, so the caller must
	// initialise it.
	// NOTE(review): FirstRealElt is dereferenced without comparing it against
	// M.end(), so this appears to rely on M having at least one non-undef
	// entry - confirm against callers.
	static bool isEXTMask(ArrayRef<int> M, EVT VT, bool &ReverseEXT,
	unsigned &Imm) {
	// Look for the first non-undef element.
	const int *FirstRealElt = find_if(M, [](int Elt) { return Elt >= 0; });

	// Benefit from APInt to handle overflow when calculating expected element.
	// MaskBits is log2(2 * NumElts), so ExpectedElt wraps around after the last
	// index of the two concatenated sources.
	unsigned NumElts = VT.getVectorNumElements();
	unsigned MaskBits = APInt(32, NumElts * 2).logBase2();
	APInt ExpectedElt = APInt(MaskBits, *FirstRealElt + 1);
	// The following shuffle indices must be the successive elements after the
	// first real element.
	// The lambda advances ExpectedElt on every element it inspects, including
	// undef (-1) entries, so the sequence position stays in step with the mask.
	const int *FirstWrongElt = std::find_if(FirstRealElt + 1, M.end(),
	[&](int Elt) {return Elt != ExpectedElt++ && Elt != -1;});
	if (FirstWrongElt != M.end())
	return false;

	// The index of an EXT is the first element if it is not UNDEF.
	// Watch out for the beginning UNDEFs. The EXT index should be the expected
	// value of the first element. E.g.
	// <-1, -1, 3, ...> is treated as <1, 2, 3, ...>.
	// <-1, -1, 0, 1, ...> is treated as <2*NumElts-2, 2*NumElts-1, 0, 1, ...>.
	// ExpectedElt is the last mask index plus 1.
	Imm = ExpectedElt.getZExtValue();

	// There are two different cases requiring the input vectors to be reversed.
	// For example, for vector <4 x i32> we have the following cases,
	// Case 1: shufflevector(<4 x i32>,<4 x i32>,<-1, -1, -1, 0>)
	// Case 2: shufflevector(<4 x i32>,<4 x i32>,<-1, -1, 7, 0>)
	// For both cases, we finally use mask <5, 6, 7, 0>, which requires
	// to reverse two input vectors.
	if (Imm < NumElts)
	ReverseEXT = true;
	else
	Imm -= NumElts;

	return true;
	}

	/// isREVMask - Check if a vector shuffle corresponds to a REV
	/// instruction with the specified blocksize. (The order of the elements
	/// within each block of the vector is reversed.)
	///
	/// \param M          shuffle mask; negative entries are undef and match
	///                   anything.
	/// \param VT         vector type being shuffled.
	/// \param BlockSize  block width in bits; must be 16, 32 or 64.
	/// \returns true when every defined mask entry is the mirrored position of
	///          its index inside its block.
	static bool isREVMask(ArrayRef<int> M, EVT VT, unsigned BlockSize) {
	  assert((BlockSize == 16 || BlockSize == 32 || BlockSize == 64) &&
	         "Only possible block sizes for REV are: 16, 32, 64");

	  unsigned EltSz = VT.getScalarSizeInBits();
	  if (EltSz == 64)
	    return false;

	  unsigned NumElts = VT.getVectorNumElements();
	  // A reversed block starts with its last element, so the first mask entry
	  // tells us how many elements the block holds.
	  unsigned BlockElts = M[0] + 1;
	  // If the first shuffle index is UNDEF, be optimistic.
	  if (M[0] < 0)
	    BlockElts = BlockSize / EltSz;

	  // The block must span more than one element and match the requested size.
	  if (BlockSize <= EltSz || BlockSize != BlockElts * EltSz)
	    return false;

	  for (unsigned i = 0; i < NumElts; ++i) {
	    if (M[i] < 0)
	      continue; // ignore UNDEF indices
	    // Block start plus the mirrored offset within the block.
	    if ((unsigned)M[i] != (i - i % BlockElts) + (BlockElts - 1 - i % BlockElts))
	      return false;
	  }

	  return true;
	}

	/// isZIPMask - Check whether the mask interleaves elements of two sources,
	/// i.e. is <Idx, Idx + NumElts, Idx + 1, Idx + 1 + NumElts, ...>.
	/// WhichResult is set to 0 when M[0] is 0 (Idx starts at 0) and to 1
	/// otherwise (Idx starts at NumElts / 2). Negative (undef) entries match
	/// anything. WhichResult is left untouched when the element count is odd.
	static bool isZIPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
	  unsigned NumElts = VT.getVectorNumElements();
	  // Mask entries are consumed in pairs; an odd count would read past the end
	  // of M and never satisfy the i != NumElts exit condition.
	  if (NumElts % 2 != 0)
	    return false;
	  WhichResult = (M[0] == 0 ? 0 : 1);
	  unsigned Idx = WhichResult * NumElts / 2;
	  for (unsigned i = 0; i != NumElts; i += 2) {
	    if ((M[i] >= 0 && (unsigned)M[i] != Idx) ||
	        (M[i + 1] >= 0 && (unsigned)M[i + 1] != Idx + NumElts))
	      return false;
	    Idx += 1;
	  }

	  return true;
	}

	/// isUZPMask - Check whether the mask picks every second element, i.e.
	/// entry i is 2 * i + WhichResult. WhichResult is set to 0 when M[0] is 0
	/// and to 1 otherwise. Negative (undef) entries match anything.
	static bool isUZPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
	  WhichResult = (M[0] == 0) ? 0 : 1;

	  unsigned Count = VT.getVectorNumElements();
	  for (unsigned Pos = 0; Pos != Count; ++Pos) {
	    int Lane = M[Pos];
	    // Only defined entries are compared against the expected stride.
	    if (Lane >= 0 && (unsigned)Lane != 2 * Pos + WhichResult)
	      return false;
	  }

	  return true;
	}

	/// isTRNMask - Check whether the mask is a transpose of two sources, i.e.
	/// each pair of entries (i, i + 1) is
	/// <i + WhichResult, i + NumElts + WhichResult>. WhichResult is set to 0
	/// when M[0] is 0 and to 1 otherwise. Negative (undef) entries match
	/// anything. WhichResult is left untouched when the element count is odd.
	static bool isTRNMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
	  unsigned NumElts = VT.getVectorNumElements();
	  // Mask entries are consumed in pairs; an odd count would make the last
	  // iteration read M[NumElts], past the end of the mask.
	  if (NumElts % 2 != 0)
	    return false;
	  WhichResult = (M[0] == 0 ? 0 : 1);
	  for (unsigned i = 0; i < NumElts; i += 2) {
	    if ((M[i] >= 0 && (unsigned)M[i] != i + WhichResult) ||
	        (M[i + 1] >= 0 && (unsigned)M[i + 1] != i + NumElts + WhichResult))
	      return false;
	  }
	  return true;
	}

	/// isZIP_v_undef_Mask - Special case of isZIPMask for canonical form of
	/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
	/// Mask is e.g., <0, 0, 1, 1> instead of <0, 4, 1, 5>.
	/// WhichResult is set to 0 when M[0] is 0 and to 1 otherwise; it is left
	/// untouched when the element count is odd.
	static bool isZIP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
	  unsigned NumElts = VT.getVectorNumElements();
	  // Mask entries are consumed in pairs; an odd count would read past the end
	  // of M and never satisfy the i != NumElts exit condition.
	  if (NumElts % 2 != 0)
	    return false;
	  WhichResult = (M[0] == 0 ? 0 : 1);
	  unsigned Idx = WhichResult * NumElts / 2;
	  for (unsigned i = 0; i != NumElts; i += 2) {
	    // Both entries of a pair must name the same element of the single source.
	    if ((M[i] >= 0 && (unsigned)M[i] != Idx) ||
	        (M[i + 1] >= 0 && (unsigned)M[i + 1] != Idx))
	      return false;
	    Idx += 1;
	  }

	  return true;
	}

	/// isUZP_v_undef_Mask - Special case of isUZPMask for canonical form of
	/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
	/// Mask is e.g., <0, 2, 0, 2> instead of <0, 2, 4, 6>.
	/// Each half of the mask must step through the source by two, starting at
	/// WhichResult (0 when M[0] is 0, otherwise 1). Undef entries match anything.
	static bool isUZP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
	  unsigned Half = VT.getVectorNumElements() / 2;
	  WhichResult = (M[0] == 0) ? 0 : 1;

	  // Walk both halves in one pass; the expected index restarts at the
	  // beginning of the second half.
	  for (unsigned Pos = 0, End = 2 * Half; Pos != End; ++Pos) {
	    int Lane = M[Pos];
	    unsigned Want = WhichResult + 2 * (Pos % Half);
	    if (Lane >= 0 && (unsigned)Lane != Want)
	      return false;
	  }

	  return true;
	}

	/// isTRN_v_undef_Mask - Special case of isTRNMask for canonical form of
	/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
	/// Mask is e.g., <0, 0, 2, 2> instead of <0, 4, 2, 6>.
	/// WhichResult is set to 0 when M[0] is 0 and to 1 otherwise; it is left
	/// untouched when the element count is odd.
	static bool isTRN_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
	  unsigned NumElts = VT.getVectorNumElements();
	  // Mask entries are consumed in pairs; an odd count would make the last
	  // iteration read M[NumElts], past the end of the mask.
	  if (NumElts % 2 != 0)
	    return false;
	  WhichResult = (M[0] == 0 ? 0 : 1);
	  for (unsigned i = 0; i < NumElts; i += 2) {
	    // Both entries of a pair must name the same element of the single source.
	    if ((M[i] >= 0 && (unsigned)M[i] != i + WhichResult) ||
	        (M[i + 1] >= 0 && (unsigned)M[i + 1] != i + WhichResult))
	      return false;
	  }
	  return true;
	}

	/// isINSMask - Check whether the mask is the identity on one of the two
	/// inputs except for exactly one lane. On success DstIsLeft says which input
	/// is kept (true for the first one) and Anomaly is the index of the last lane
	/// that differed from it; neither is written on failure. Entries equal to -1
	/// count as matching both inputs. The mask must have exactly
	/// NumInputElements entries.
	static bool isINSMask(ArrayRef<int> M, int NumInputElements,
	                      bool &DstIsLeft, int &Anomaly) {
	  if (static_cast<size_t>(NumInputElements) != M.size())
	    return false;

	  int LeftHits = 0, RightHits = 0;
	  int LeftOdd = -1, RightOdd = -1;

	  for (int Pos = 0; Pos < NumInputElements; ++Pos) {
	    int Lane = M[Pos];

	    // An undef lane is compatible with either input.
	    if (Lane == -1) {
	      ++LeftHits;
	      ++RightHits;
	      continue;
	    }

	    if (Lane == Pos)
	      ++LeftHits;
	    else
	      LeftOdd = Pos;

	    if (Lane == Pos + NumInputElements)
	      ++RightHits;
	    else
	      RightOdd = Pos;
	  }

	  // Exactly one lane may deviate; the first input is preferred on a tie.
	  int Needed = NumInputElements - 1;
	  if (LeftHits == Needed) {
	    DstIsLeft = true;
	    Anomaly = LeftOdd;
	    return true;
	  }
	  if (RightHits == Needed) {
	    DstIsLeft = false;
	    Anomaly = RightOdd;
	    return true;
	  }

	  return false;
	}

	/// isConcatMask - Check whether a mask for a 128-bit result selects the
	/// leading half of the first input followed by the leading half of the
	/// second. The lower half of the mask must be the identity. The upper half
	/// must be the identity as well, shifted up by half the element count when
	/// SplitLHS is true. Undef entries are not accepted.
	static bool isConcatMask(ArrayRef<int> Mask, EVT VT, bool SplitLHS) {
	  if (VT.getSizeInBits() != 128)
	    return false;

	  int Count = VT.getVectorNumElements();
	  int Half = Count / 2;

	  for (int Pos = 0; Pos < Count; ++Pos) {
	    int Want = Pos;
	    // Upper-half entries are offset only when the first input is split.
	    if (Pos >= Half && SplitLHS)
	      Want += Half;
	    if (Mask[Pos] != Want)
	      return false;
	  }

	  return true;
	}

	/// tryFormConcatFromShuffle - Try to rewrite a VECTOR_SHUFFLE as a
	/// CONCAT_VECTORS of its two operands. Any operand that is 128 bits wide is
	/// first reduced to its low half with EXTRACT_SUBVECTOR. Returns an empty
	/// SDValue when the element types differ or the mask is not a concat mask.
	static SDValue tryFormConcatFromShuffle(SDValue Op, SelectionDAG &DAG) {
	  SDLoc DL(Op);
	  EVT VT = Op.getValueType();
	  SDValue V0 = Op.getOperand(0);
	  SDValue V1 = Op.getOperand(1);
	  ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Op)->getMask();

	  // Both inputs must share the result's element type.
	  if (VT.getVectorElementType() != V0.getValueType().getVectorElementType() ||
	      VT.getVectorElementType() != V1.getValueType().getVectorElementType())
	    return SDValue();

	  // A 128-bit first operand contributes only its low half.
	  bool SplitV0 = V0.getValueSizeInBits() == 128;

	  if (!isConcatMask(Mask, VT, SplitV0))
	    return SDValue();

	  // Type of each half of the result.
	  EVT CastVT = EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(),
	                                VT.getVectorNumElements() / 2);
	  if (SplitV0) {
	    V0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, CastVT, V0,
	                     DAG.getConstant(0, DL, MVT::i64));
	  }
	  if (V1.getValueSizeInBits() == 128) {
	    V1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, CastVT, V1,
	                     DAG.getConstant(0, DL, MVT::i64));
	  }
	  return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, V0, V1);
	}

	/// GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit
	/// the specified operations to build the shuffle.
	///
	/// PFEntry is decoded as: bits 26-29 the operation, bits 13-25 the table
	/// index of the left operand, bits 0-12 the table index of the right operand.
	/// Both operands are generated recursively from PerfectShuffleTable until an
	/// OP_COPY entry is reached, which yields LHS or RHS directly.
	static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS,
	                                      SDValue RHS, SelectionDAG &DAG,
	                                      const SDLoc &dl) {
	  unsigned OpNum = (PFEntry >> 26) & 0x0F;
	  unsigned LHSID = (PFEntry >> 13) & ((1 << 13) - 1);
	  unsigned RHSID = (PFEntry >> 0) & ((1 << 13) - 1);

	  enum {
	    OP_COPY = 0, // Copy, used for things like <u,u,u,3> to say it is <0,1,2,3>
	    OP_VREV,
	    OP_VDUP0,
	    OP_VDUP1,
	    OP_VDUP2,
	    OP_VDUP3,
	    OP_VEXT1,
	    OP_VEXT2,
	    OP_VEXT3,
	    OP_VUZPL, // VUZP, left result
	    OP_VUZPR, // VUZP, right result
	    OP_VZIPL, // VZIP, left result
	    OP_VZIPR, // VZIP, right result
	    OP_VTRNL, // VTRN, left result
	    OP_VTRNR // VTRN, right result
	  };

	  if (OpNum == OP_COPY) {
	    // The IDs are base-9 encodings of the masks <0,1,2,3> and <4,5,6,7>.
	    if (LHSID == (1 * 9 + 2) * 9 + 3)
	      return LHS;
	    assert(LHSID == ((4 * 9 + 5) * 9 + 6) * 9 + 7 && "Illegal OP_COPY!");
	    return RHS;
	  }

	  SDValue OpLHS, OpRHS;
	  OpLHS = GeneratePerfectShuffle(PerfectShuffleTable[LHSID], LHS, RHS, DAG, dl);
	  OpRHS = GeneratePerfectShuffle(PerfectShuffleTable[RHSID], LHS, RHS, DAG, dl);
	  EVT VT = OpLHS.getValueType();

	  switch (OpNum) {
	  default:
	    llvm_unreachable("Unknown shuffle opcode!");
	  case OP_VREV:
	    // VREV divides the vector in half and swaps within the half.
	    if (VT.getVectorElementType() == MVT::i32 ||
	        VT.getVectorElementType() == MVT::f32)
	      return DAG.getNode(AArch64ISD::REV64, dl, VT, OpLHS);
	    // vrev <4 x i16> -> REV32
	    if (VT.getVectorElementType() == MVT::i16 ||
	        VT.getVectorElementType() == MVT::f16)
	      return DAG.getNode(AArch64ISD::REV32, dl, VT, OpLHS);
	    // vrev <4 x i8> -> REV16
	    assert(VT.getVectorElementType() == MVT::i8);
	    return DAG.getNode(AArch64ISD::REV16, dl, VT, OpLHS);
	  case OP_VDUP0:
	  case OP_VDUP1:
	  case OP_VDUP2:
	  case OP_VDUP3: {
	    // Pick the lane-duplicate opcode matching the element width.
	    EVT EltTy = VT.getVectorElementType();
	    unsigned Opcode;
	    if (EltTy == MVT::i8)
	      Opcode = AArch64ISD::DUPLANE8;
	    else if (EltTy == MVT::i16 || EltTy == MVT::f16)
	      Opcode = AArch64ISD::DUPLANE16;
	    else if (EltTy == MVT::i32 || EltTy == MVT::f32)
	      Opcode = AArch64ISD::DUPLANE32;
	    else if (EltTy == MVT::i64 || EltTy == MVT::f64)
	      Opcode = AArch64ISD::DUPLANE64;
	    else
	      llvm_unreachable("Invalid vector element type?");

	    // A 64-bit source is widened before the lane duplicate.
	    if (VT.getSizeInBits() == 64)
	      OpLHS = WidenVector(OpLHS, DAG);
	    SDValue Lane = DAG.getConstant(OpNum - OP_VDUP0, dl, MVT::i64);
	    return DAG.getNode(Opcode, dl, VT, OpLHS, Lane);
	  }
	  case OP_VEXT1:
	  case OP_VEXT2:
	  case OP_VEXT3: {
	    // Element offset (1, 2 or 3) scaled to bytes.
	    unsigned Imm = (OpNum - OP_VEXT1 + 1) * getExtFactor(OpLHS);
	    return DAG.getNode(AArch64ISD::EXT, dl, VT, OpLHS, OpRHS,
	                       DAG.getConstant(Imm, dl, MVT::i32));
	  }
	  case OP_VUZPL:
	    return DAG.getNode(AArch64ISD::UZP1, dl, DAG.getVTList(VT, VT), OpLHS,
	                       OpRHS);
	  case OP_VUZPR:
	    return DAG.getNode(AArch64ISD::UZP2, dl, DAG.getVTList(VT, VT), OpLHS,
	                       OpRHS);
	  case OP_VZIPL:
	    return DAG.getNode(AArch64ISD::ZIP1, dl, DAG.getVTList(VT, VT), OpLHS,
	                       OpRHS);
	  case OP_VZIPR:
	    return DAG.getNode(AArch64ISD::ZIP2, dl, DAG.getVTList(VT, VT), OpLHS,
	                       OpRHS);
	  case OP_VTRNL:
	    return DAG.getNode(AArch64ISD::TRN1, dl, DAG.getVTList(VT, VT), OpLHS,
	                       OpRHS);
	  case OP_VTRNR:
	    return DAG.getNode(AArch64ISD::TRN2, dl, DAG.getVTList(VT, VT), OpLHS,
	                       OpRHS);
	  }
	}

	/// GenerateTBL - Lower a vector shuffle to a table-lookup intrinsic. The
	/// shuffle mask is expanded into one byte index per byte of every selected
	/// element. A tbl1 is emitted when the second operand is undef or the index
	/// vector has 8 entries (the table is then a CONCAT_VECTORS of the operands);
	/// otherwise a tbl2 over both operands is emitted. The result is bitcast
	/// back to Op's type.
	static SDValue GenerateTBL(SDValue Op, ArrayRef<int> ShuffleMask,
	                           SelectionDAG &DAG) {
	  // Check to see if we can use the TBL instruction.
	  SDValue V1 = Op.getOperand(0);
	  SDValue V2 = Op.getOperand(1);
	  SDLoc DL(Op);

	  unsigned BytesPerElt =
	      Op.getValueType().getVectorElementType().getSizeInBits() / 8;

	  // Expand each element index into the byte offsets it covers.
	  SmallVector<SDValue, 8> TBLMask;
	  for (int Val : ShuffleMask) {
	    unsigned FirstByte = Val * BytesPerElt;
	    for (unsigned Byte = 0; Byte < BytesPerElt; ++Byte)
	      TBLMask.push_back(DAG.getConstant(FirstByte + Byte, DL, MVT::i32));
	  }

	  bool Is128 = Op.getValueSizeInBits() == 128;
	  MVT IndexVT = Is128 ? MVT::v16i8 : MVT::v8i8;
	  unsigned IndexLen = Is128 ? 16 : 8;

	  SDValue V1Cst = DAG.getNode(ISD::BITCAST, DL, IndexVT, V1);
	  SDValue V2Cst = DAG.getNode(ISD::BITCAST, DL, IndexVT, V2);

	  bool SecondIsUndef = V2.getNode()->isUndef();
	  auto MakeIndexVector = [&]() {
	    return DAG.getBuildVector(IndexVT, DL,
	                              makeArrayRef(TBLMask.data(), IndexLen));
	  };

	  SDValue Shuffle;
	  if (SecondIsUndef || IndexLen == 8) {
	    // Single-table lookup. With 8 indices the table is formed by
	    // concatenating the first operand with itself (second operand undef) or
	    // with the second operand.
	    if (IndexLen == 8)
	      V1Cst = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V1Cst,
	                          SecondIsUndef ? V1Cst : V2Cst);
	    Shuffle = DAG.getNode(
	        ISD::INTRINSIC_WO_CHAIN, DL, IndexVT,
	        DAG.getConstant(Intrinsic::aarch64_neon_tbl1, DL, MVT::i32), V1Cst,
	        MakeIndexVector());
	  } else {
	    // FIXME: We cannot, for the moment, emit a TBL2 instruction because we
	    // cannot currently represent the register constraints on the input
	    // table registers.
	    // Shuffle = DAG.getNode(AArch64ISD::TBL2, DL, IndexVT, V1Cst, V2Cst,
	    // DAG.getBuildVector(IndexVT, DL, &TBLMask[0],
	    // IndexLen));
	    Shuffle = DAG.getNode(
	        ISD::INTRINSIC_WO_CHAIN, DL, IndexVT,
	        DAG.getConstant(Intrinsic::aarch64_neon_tbl2, DL, MVT::i32), V1Cst,
	        V2Cst, MakeIndexVector());
	  }
	  return DAG.getNode(ISD::BITCAST, DL, Op.getValueType(), Shuffle);
	}

	/// getDUPLANEOp - Return the AArch64ISD::DUPLANE* opcode matching the given
	/// vector element type: i8 -> DUPLANE8, i16/f16 -> DUPLANE16,
	/// i32/f32 -> DUPLANE32, i64/f64 -> DUPLANE64. Any other type is treated as
	/// unreachable.
	static unsigned getDUPLANEOp(EVT EltType) {
	  if (EltType == MVT::i8)
	    return AArch64ISD::DUPLANE8;
	  if (EltType == MVT::i16 || EltType == MVT::f16)
	    return AArch64ISD::DUPLANE16;
	  if (EltType == MVT::i32 || EltType == MVT::f32)
	    return AArch64ISD::DUPLANE32;
	  if (EltType == MVT::i64 || EltType == MVT::f64)
	    return AArch64ISD::DUPLANE64;

	  llvm_unreachable("Invalid vector element type?");
	}

	// Lower a VECTOR_SHUFFLE node. The mask is tested against a fixed sequence
	// of patterns and the first match wins: splat (DUP / DUPLANE), REV64/32/16,
	// EXT, ZIP/UZP/TRN with two sources, ZIP/UZP/TRN with one source, concat,
	// single-lane insert, the perfect-shuffle table for 4-element vectors, and
	// finally a TBL lookup as the fallback.
	SDValue AArch64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
	SelectionDAG &DAG) const {
	SDLoc dl(Op);
	EVT VT = Op.getValueType();

	ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode());

	// Convert shuffles that are directly supported on NEON to target-specific
	// DAG nodes, instead of keeping them as shuffles and matching them again
	// during code selection. This is more efficient and avoids the possibility
	// of inconsistencies between legalization and selection.
	ArrayRef<int> ShuffleMask = SVN->getMask();

	SDValue V1 = Op.getOperand(0);
	SDValue V2 = Op.getOperand(1);

	if (SVN->isSplat()) {
	int Lane = SVN->getSplatIndex();
	// If this is undef splat, generate it via "just" vdup, if possible.
	if (Lane == -1)
	Lane = 0;

	if (Lane == 0 && V1.getOpcode() == ISD::SCALAR_TO_VECTOR)
	return DAG.getNode(AArch64ISD::DUP, dl, V1.getValueType(),
	V1.getOperand(0));
	// Test if V1 is a BUILD_VECTOR and the lane being referenced is a non-
	// constant. If so, we can just reference the lane's definition directly.
	if (V1.getOpcode() == ISD::BUILD_VECTOR &&
	!isa<ConstantSDNode>(V1.getOperand(Lane)))
	return DAG.getNode(AArch64ISD::DUP, dl, VT, V1.getOperand(Lane));

	// Otherwise, duplicate from the lane of the input vector.
	unsigned Opcode = getDUPLANEOp(V1.getValueType().getVectorElementType());

	// SelectionDAGBuilder may have "helpfully" already extracted or concatenated
	// to make a vector of the same size as this SHUFFLE. We can ignore the
	// extract entirely, and canonicalise the concat using WidenVector.
	if (V1.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
	// Re-base the lane onto the vector the subvector was extracted from.
	Lane += cast<ConstantSDNode>(V1.getOperand(1))->getZExtValue();
	V1 = V1.getOperand(0);
	} else if (V1.getOpcode() == ISD::CONCAT_VECTORS) {
	// Idx selects the concat operand holding the lane (0 = low half,
	// 1 = high half); Lane is then made relative to that operand.
	unsigned Idx = Lane >= (int)VT.getVectorNumElements() / 2;
	Lane -= Idx * VT.getVectorNumElements() / 2;
	V1 = WidenVector(V1.getOperand(Idx), DAG);
	} else if (VT.getSizeInBits() == 64)
	V1 = WidenVector(V1, DAG);

	return DAG.getNode(Opcode, dl, VT, V1, DAG.getConstant(Lane, dl, MVT::i64));
	}

	// Element reversal within 64-, 32- or 16-bit blocks.
	if (isREVMask(ShuffleMask, VT, 64))
	return DAG.getNode(AArch64ISD::REV64, dl, V1.getValueType(), V1, V2);
	if (isREVMask(ShuffleMask, VT, 32))
	return DAG.getNode(AArch64ISD::REV32, dl, V1.getValueType(), V1, V2);
	if (isREVMask(ShuffleMask, VT, 16))
	return DAG.getNode(AArch64ISD::REV16, dl, V1.getValueType(), V1, V2);

	// EXT: Imm comes back in elements and is scaled to bytes by getExtFactor.
	// ReverseEXT must start out false because isEXTMask only ever sets it.
	bool ReverseEXT = false;
	unsigned Imm;
	if (isEXTMask(ShuffleMask, VT, ReverseEXT, Imm)) {
	if (ReverseEXT)
	std::swap(V1, V2);
	Imm *= getExtFactor(V1);
	return DAG.getNode(AArch64ISD::EXT, dl, V1.getValueType(), V1, V2,
	DAG.getConstant(Imm, dl, MVT::i32));
	} else if (V2->isUndef() && isSingletonEXTMask(ShuffleMask, VT, Imm)) {
	// Single-source rotate: V1 is used for both EXT operands.
	Imm *= getExtFactor(V1);
	return DAG.getNode(AArch64ISD::EXT, dl, V1.getValueType(), V1, V1,
	DAG.getConstant(Imm, dl, MVT::i32));
	}

	// Two-source ZIP / UZP / TRN; WhichResult picks the "1" or "2" variant.
	unsigned WhichResult;
	if (isZIPMask(ShuffleMask, VT, WhichResult)) {
	unsigned Opc = (WhichResult == 0) ? AArch64ISD::ZIP1 : AArch64ISD::ZIP2;
	return DAG.getNode(Opc, dl, V1.getValueType(), V1, V2);
	}
	if (isUZPMask(ShuffleMask, VT, WhichResult)) {
	unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
	return DAG.getNode(Opc, dl, V1.getValueType(), V1, V2);
	}
	if (isTRNMask(ShuffleMask, VT, WhichResult)) {
	unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
	return DAG.getNode(Opc, dl, V1.getValueType(), V1, V2);
	}

	// Single-source forms of the same patterns: V1 is passed as both operands.
	if (isZIP_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
	unsigned Opc = (WhichResult == 0) ? AArch64ISD::ZIP1 : AArch64ISD::ZIP2;
	return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1);
	}
	if (isUZP_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
	unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
	return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1);
	}
	if (isTRN_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
	unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
	return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1);
	}

	if (SDValue Concat = tryFormConcatFromShuffle(Op, DAG))
	return Concat;

	// Identity on one input except for a single lane: lower as an
	// extract-element / insert-element pair.
	bool DstIsLeft;
	int Anomaly;
	int NumInputElements = V1.getValueType().getVectorNumElements();
	if (isINSMask(ShuffleMask, NumInputElements, DstIsLeft, Anomaly)) {
	SDValue DstVec = DstIsLeft ? V1 : V2;
	SDValue DstLaneV = DAG.getConstant(Anomaly, dl, MVT::i64);

	// Mask values at or above NumInputElements refer to lanes of V2.
	SDValue SrcVec = V1;
	int SrcLane = ShuffleMask[Anomaly];
	if (SrcLane >= NumInputElements) {
	SrcVec = V2;
	SrcLane -= VT.getVectorNumElements();
	}
	SDValue SrcLaneV = DAG.getConstant(SrcLane, dl, MVT::i64);

	EVT ScalarVT = VT.getVectorElementType();

	// Integer elements narrower than 32 bits are extracted as i32.
	if (ScalarVT.getSizeInBits() < 32 && ScalarVT.isInteger())
	ScalarVT = MVT::i32;

	return DAG.getNode(
	ISD::INSERT_VECTOR_ELT, dl, VT, DstVec,
	DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ScalarVT, SrcVec, SrcLaneV),
	DstLaneV);
	}

	// If the shuffle is not directly supported and it has 4 elements, use
	// the PerfectShuffle-generated table to synthesize it from other shuffles.
	unsigned NumElts = VT.getVectorNumElements();
	if (NumElts == 4) {
	// Each mask entry becomes a base-9 digit; 8 stands for an undef lane.
	unsigned PFIndexes[4];
	for (unsigned i = 0; i != 4; ++i) {
	if (ShuffleMask[i] < 0)
	PFIndexes[i] = 8;
	else
	PFIndexes[i] = ShuffleMask[i];
	}

	// Compute the index in the perfect shuffle table.
	unsigned PFTableIndex = PFIndexes[0] * 9 * 9 * 9 + PFIndexes[1] * 9 * 9 +
	PFIndexes[2] * 9 + PFIndexes[3];
	unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
	// The top two bits of the entry hold its cost.
	unsigned Cost = (PFEntry >> 30);

	if (Cost <= 4)
	return GeneratePerfectShuffle(PFEntry, V1, V2, DAG, dl);
	}

	// Nothing cheaper matched: fall back to a table lookup.
	return GenerateTBL(Op, ShuffleMask, DAG);
	}

	/// Expand a constant-splat BUILD_VECTOR into full-width bit patterns.
	///
	/// If \p BVN is a constant splat, the splat value is replicated across the
	/// whole vector width and OR-ed into \p CnstBits, while the splat value XOR-ed
	/// with its undef mask is replicated into \p UndefBits. Both accumulators are
	/// shifted left before every OR, so callers are expected to pass them in
	/// zero-initialised and as wide as the vector.
	///
	/// \returns true when \p BVN is a constant splat (outputs written), false
	/// otherwise (outputs untouched).
	static bool resolveBuildVector(BuildVectorSDNode *BVN, APInt &CnstBits,
	                               APInt &UndefBits) {
	  APInt SplatBits, SplatUndef;
	  unsigned SplatBitSize;
	  bool HasAnyUndefs;
	  if (!BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs))
	    return false;

	  const unsigned VecBits = BVN->getValueType(0).getSizeInBits();

	  // The pieces OR-ed in on every iteration do not change, so widen them once.
	  const APInt WideSplat = SplatBits.zextOrTrunc(VecBits);
	  const APInt WideToggled = (SplatBits ^ SplatUndef).zextOrTrunc(VecBits);

	  for (unsigned Remaining = VecBits / SplatBitSize; Remaining != 0;
	       --Remaining) {
	    CnstBits <<= SplatBitSize;
	    UndefBits <<= SplatBitSize;
	    CnstBits |= WideSplat;
	    UndefBits |= WideToggled;
	  }

	  return true;
	}

	/// Lower a vector AND whose second operand is a constant-splat BUILD_VECTOR
	/// into a BIC (and-not) with a modified immediate, when the inverted constant
	/// is encodable as AdvSIMD modified-immediate type 1-6.
	///
	/// Two candidate bit patterns are tried in order: the inverted constant bits,
	/// then the inverted undef-toggled bits. If neither matches, or the operand
	/// is not a constant splat, \p Op is returned unchanged.
	SDValue AArch64TargetLowering::LowerVectorAND(SDValue Op,
	                                              SelectionDAG &DAG) const {
	  SDLoc dl(Op);
	  EVT VT = Op.getValueType();
	  SDValue LHS = Op.getOperand(0);

	  auto *BVN = dyn_cast<BuildVectorSDNode>(Op.getOperand(1).getNode());
	  if (!BVN)
	    return Op;

	  const unsigned VecBits = VT.getSizeInBits();
	  APInt CnstBits(VecBits, 0);
	  APInt UndefBits(VecBits, 0);
	  if (!resolveBuildVector(BVN, CnstBits, UndefBits))
	    return Op;

	  // Operation types for the 32-bit-lane and 16-bit-lane immediate forms.
	  const bool Is128 = VecBits == 128;
	  const MVT Ty32 = Is128 ? MVT::v4i32 : MVT::v2i32;
	  const MVT Ty16 = Is128 ? MVT::v8i16 : MVT::v4i16;

	  // Build BICi(LHS, Imm, Shift) in OpTy and cast the result back to VT.
	  auto EmitBIC = [&](MVT OpTy, uint64_t Imm, uint64_t Shift) {
	    SDValue Bic = DAG.getNode(AArch64ISD::BICi, dl, OpTy, LHS,
	                              DAG.getConstant(Imm, dl, MVT::i32),
	                              DAG.getConstant(Shift, dl, MVT::i32));
	    return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Bic);
	  };

	  // We only have BIC vector immediate instruction, which is and-not, so both
	  // candidates are inverted before matching.
	  const APInt Candidates[] = {~CnstBits, ~UndefBits};
	  for (const APInt &Bits : Candidates) {
	    // The immediate must be identical in both 64-bit halves.
	    if (Bits.getHiBits(64) != Bits.getLoBits(64))
	      continue;

	    const uint64_t Imm = Bits.zextOrTrunc(64).getZExtValue();

	    if (AArch64_AM::isAdvSIMDModImmType1(Imm))
	      return EmitBIC(Ty32, AArch64_AM::encodeAdvSIMDModImmType1(Imm), 0);
	    if (AArch64_AM::isAdvSIMDModImmType2(Imm))
	      return EmitBIC(Ty32, AArch64_AM::encodeAdvSIMDModImmType2(Imm), 8);
	    if (AArch64_AM::isAdvSIMDModImmType3(Imm))
	      return EmitBIC(Ty32, AArch64_AM::encodeAdvSIMDModImmType3(Imm), 16);
	    if (AArch64_AM::isAdvSIMDModImmType4(Imm))
	      return EmitBIC(Ty32, AArch64_AM::encodeAdvSIMDModImmType4(Imm), 24);
	    if (AArch64_AM::isAdvSIMDModImmType5(Imm))
	      return EmitBIC(Ty16, AArch64_AM::encodeAdvSIMDModImmType5(Imm), 0);
	    if (AArch64_AM::isAdvSIMDModImmType6(Imm))
	      return EmitBIC(Ty16, AArch64_AM::encodeAdvSIMDModImmType6(Imm), 8);
	  }

	  // We can always fall back to a non-immediate AND.
	  return Op;
	}

	// Fast check for a BUILD_VECTOR whose lanes are all one and the same integer
	// constant. On success the zero-extended constant is stored in ConstVal and
	// true is returned; otherwise ConstVal is left untouched and false is
	// returned. Lanes are compared by node pointer against lane 0, so every lane
	// must be the very same ConstantSDNode.
	static bool isAllConstantBuildVector(const SDValue &PotentialBVec,
	                                     uint64_t &ConstVal) {
	  auto *BV = dyn_cast<BuildVectorSDNode>(PotentialBVec);
	  if (!BV)
	    return false;

	  auto *Lane0 = dyn_cast<ConstantSDNode>(BV->getOperand(0));
	  if (!Lane0)
	    return false;

	  const unsigned LaneCount = BV->getValueType(0).getVectorNumElements();
	  for (unsigned Lane = 1; Lane < LaneCount; ++Lane) {
	    if (dyn_cast<ConstantSDNode>(BV->getOperand(Lane)) != Lane0)
	      return false;
	  }

	  ConstVal = Lane0->getZExtValue();
	  return true;
	}

	/// Return the intrinsic ID carried by an INTRINSIC_WO_CHAIN node (read from
	/// its first operand), or Intrinsic::not_intrinsic when \p N has a different
	/// opcode or the ID is not below Intrinsic::num_intrinsics.
	static unsigned getIntrinsicID(const SDNode *N) {
	  if (N->getOpcode() != ISD::INTRINSIC_WO_CHAIN)
	    return Intrinsic::not_intrinsic;

	  const unsigned ID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
	  if (ID >= Intrinsic::num_intrinsics)
	    return Intrinsic::not_intrinsic;
	  return ID;
	}

	// Attempt to form a vector S[LR]I from (or (and X, BvecC1), (lsl Y, C2)),
	// to (SLI X, Y, C2), where X and Y have matching vector types, BvecC1 is a
	// BUILD_VECTOR with constant element C1, C2 is a constant shift amount, and
	// C1 is exactly the mask of the bits the shifted value cannot touch:
	//   - shift left  (SLI): the low  C2 bits of every element are kept from X;
	//   - shift right (SRI): the high C2 bits of every element are kept from X.
	// Returns an empty SDValue when the pattern does not match.
	static SDValue tryLowerToSLI(SDNode *N, SelectionDAG &DAG) {
	  EVT VT = N->getValueType(0);

	  if (!VT.isVector())
	    return SDValue();

	  SDLoc DL(N);

	  // Is the first op an AND?
	  const SDValue And = N->getOperand(0);
	  if (And.getOpcode() != ISD::AND)
	    return SDValue();

	  // Is the second op an shl or lshr?
	  SDValue Shift = N->getOperand(1);
	  // This will have been turned into: AArch64ISD::VSHL vector, #shift
	  // or AArch64ISD::VLSHR vector, #shift
	  unsigned ShiftOpc = Shift.getOpcode();
	  if ((ShiftOpc != AArch64ISD::VSHL && ShiftOpc != AArch64ISD::VLSHR))
	    return SDValue();
	  bool IsShiftRight = ShiftOpc == AArch64ISD::VLSHR;

	  // Is the shift amount constant?
	  ConstantSDNode *C2node = dyn_cast<ConstantSDNode>(Shift.getOperand(1));
	  if (!C2node)
	    return SDValue();

	  // Is the and mask vector all constant?
	  uint64_t C1;
	  if (!isAllConstantBuildVector(And.getOperand(1), C1))
	    return SDValue();

	  // Does C1 preserve exactly the element bits that the shifted operand leaves
	  // untouched? The masks are built with APInt so that 32- and 64-bit elements
	  // are handled without shifting a plain int by its full width (undefined
	  // behaviour), and C1 is truncated to the element width before comparing.
	  uint64_t C2 = C2node->getZExtValue();
	  unsigned ElemSizeInBits = VT.getScalarSizeInBits();
	  if (C2 > ElemSizeInBits)
	    return SDValue();
	  const unsigned ShiftAmt = static_cast<unsigned>(C2);
	  const APInt C1AsAPInt(ElemSizeInBits, C1);
	  const APInt RequiredC1 =
	      IsShiftRight ? APInt::getHighBitsSet(ElemSizeInBits, ShiftAmt)
	                   : APInt::getLowBitsSet(ElemSizeInBits, ShiftAmt);
	  if (C1AsAPInt != RequiredC1)
	    return SDValue();

	  SDValue X = And.getOperand(0);
	  SDValue Y = Shift.getOperand(0);

	  unsigned Intrin =
	      IsShiftRight ? Intrinsic::aarch64_neon_vsri : Intrinsic::aarch64_neon_vsli;
	  SDValue ResultSLI =
	      DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
	                  DAG.getConstant(Intrin, DL, MVT::i32), X, Y,
	                  Shift.getOperand(1));

	  DEBUG(dbgs() << "aarch64-lower: transformed: \n");
	  DEBUG(N->dump(&DAG));
	  DEBUG(dbgs() << "into: \n");
	  DEBUG(ResultSLI->dump(&DAG));

	  ++NumShiftInserts;
	  return ResultSLI;
	}

	/// Lower a vector OR.
	///
	/// When EnableAArch64SlrGeneration is set, first try to turn the node into a
	/// shift-and-insert via tryLowerToSLI. Otherwise (or if that fails), look for
	/// a constant-splat BUILD_VECTOR on either side and emit an ORR with a
	/// modified immediate when the constant is encodable as AdvSIMD
	/// modified-immediate type 1-6. The constant bits are tried first, then the
	/// undef-toggled bits. If nothing matches, \p Op is returned unchanged.
	SDValue AArch64TargetLowering::LowerVectorOR(SDValue Op,
	                                             SelectionDAG &DAG) const {
	  // Attempt to form a vector S[LR]I from (or (and X, C1), (lsl Y, C2))
	  if (EnableAArch64SlrGeneration) {
	    if (SDValue Res = tryLowerToSLI(Op.getNode(), DAG))
	      return Res;
	  }

	  SDLoc dl(Op);
	  EVT VT = Op.getValueType();

	  // Prefer a BUILD_VECTOR in operand 0; OR commutes, so fall back to operand 1.
	  SDValue LHS = Op.getOperand(1);
	  auto *BVN = dyn_cast<BuildVectorSDNode>(Op.getOperand(0).getNode());
	  if (!BVN) {
	    LHS = Op.getOperand(0);
	    BVN = dyn_cast<BuildVectorSDNode>(Op.getOperand(1).getNode());
	  }
	  if (!BVN)
	    return Op;

	  const unsigned VecBits = VT.getSizeInBits();
	  APInt CnstBits(VecBits, 0);
	  APInt UndefBits(VecBits, 0);
	  if (!resolveBuildVector(BVN, CnstBits, UndefBits))
	    return Op;

	  // Operation types for the 32-bit-lane and 16-bit-lane immediate forms.
	  const bool Is128 = VecBits == 128;
	  const MVT Ty32 = Is128 ? MVT::v4i32 : MVT::v2i32;
	  const MVT Ty16 = Is128 ? MVT::v8i16 : MVT::v4i16;

	  // Build ORRi(LHS, Imm, Shift) in OpTy and cast the result back to VT.
	  auto EmitORR = [&](MVT OpTy, uint64_t Imm, uint64_t Shift) {
	    SDValue Orr = DAG.getNode(AArch64ISD::ORRi, dl, OpTy, LHS,
	                              DAG.getConstant(Imm, dl, MVT::i32),
	                              DAG.getConstant(Shift, dl, MVT::i32));
	    return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Orr);
	  };

	  const APInt Candidates[] = {CnstBits, UndefBits};
	  for (const APInt &Bits : Candidates) {
	    // The immediate must be identical in both 64-bit halves.
	    if (Bits.getHiBits(64) != Bits.getLoBits(64))
	      continue;

	    const uint64_t Imm = Bits.zextOrTrunc(64).getZExtValue();

	    if (AArch64_AM::isAdvSIMDModImmType1(Imm))
	      return EmitORR(Ty32, AArch64_AM::encodeAdvSIMDModImmType1(Imm), 0);
	    if (AArch64_AM::isAdvSIMDModImmType2(Imm))
	      return EmitORR(Ty32, AArch64_AM::encodeAdvSIMDModImmType2(Imm), 8);
	    if (AArch64_AM::isAdvSIMDModImmType3(Imm))
	      return EmitORR(Ty32, AArch64_AM::encodeAdvSIMDModImmType3(Imm), 16);
	    if (AArch64_AM::isAdvSIMDModImmType4(Imm))
	      return EmitORR(Ty32, AArch64_AM::encodeAdvSIMDModImmType4(Imm), 24);
	    if (AArch64_AM::isAdvSIMDModImmType5(Imm))
	      return EmitORR(Ty16, AArch64_AM::encodeAdvSIMDModImmType5(Imm), 0);
	    if (AArch64_AM::isAdvSIMDModImmType6(Imm))
	      return EmitORR(Ty16, AArch64_AM::encodeAdvSIMDModImmType6(Imm), 8);
	  }

	  // We can always fall back to a non-immediate OR.
	  return Op;
	}

	// Canonicalise the operands of a BUILD_VECTOR whose elements are integers of
	// at most 16 bits: every constant operand is rebuilt as an i32 constant
	// holding only the low element-width bits of its value. Non-constant operands
	// are kept as they are. Floating-point vectors and vectors with wider
	// elements are returned untouched.
	static SDValue NormalizeBuildVector(SDValue Op,
	                                    SelectionDAG &DAG) {
	  assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");

	  const EVT VecTy = Op.getValueType();
	  const EVT LaneTy = VecTy.getVectorElementType();
	  const bool IsNarrowInt =
	      !LaneTy.isFloatingPoint() && LaneTy.getSizeInBits() <= 16;
	  if (!IsNarrowInt)
	    return Op;

	  SDLoc dl(Op);
	  SmallVector<SDValue, 16> NewLanes;
	  for (unsigned Idx = 0, End = Op.getNumOperands(); Idx != End; ++Idx) {
	    SDValue Lane = Op.getOperand(Idx);
	    if (auto *CstLane = dyn_cast<ConstantSDNode>(Lane)) {
	      // Constructing the APInt at element width drops the high bits.
	      const APInt LowBits(LaneTy.getSizeInBits(), CstLane->getZExtValue());
	      Lane = DAG.getConstant(LowBits.getZExtValue(), dl, MVT::i32);
	    }
	    NewLanes.push_back(Lane);
	  }
	  return DAG.getBuildVector(VecTy, dl, NewLanes);
	}

	// Custom lowering for BUILD_VECTOR.
	//
	// The operands are first normalized with NormalizeBuildVector. The function
	// then tries the following strategies in order and returns the first that
	// applies:
	//  1) Constant splats: materialize the value with a single MOVI / FMOV / MVNI
	//     style node, trying the constant bits first and the undef-toggled bits
	//     second. All-zeros and all-ones integer vectors are returned unchanged.
	//  2) All lanes undef -> UNDEF; only lane 0 defined -> SCALAR_TO_VECTOR.
	//  3) One value in every defined lane -> DUP / DUPLANE (non-constant), or, for
	//     floating-point constants, a recursive attempt on the bitcast integer
	//     vector.
	//  4) One constant value plus non-constant lanes -> DUP of the constant
	//     followed by INSERT_VECTOR_ELT for the other lanes.
	//  5) For four or more elements, ReconstructShuffle.
	//  6) A chain of INSERT_VECTOR_ELT nodes.
	// An empty SDValue is returned when the default expansion should be used.
	SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op,
	SelectionDAG &DAG) const {
	SDLoc dl(Op);
	EVT VT = Op.getValueType();
	Op = NormalizeBuildVector(Op, DAG);
	BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode());

	APInt CnstBits(VT.getSizeInBits(), 0);
	APInt UndefBits(VT.getSizeInBits(), 0);
	if (resolveBuildVector(BVN, CnstBits, UndefBits)) {
	// We make use of a little bit of goto ickiness in order to avoid having to
	// duplicate the immediate matching logic for the undef toggled case.
	bool SecondTry = false;
	AttemptModImm:

	// Only patterns whose two 64-bit halves agree are considered; the checks
	// below all work on that single 64-bit value.
	if (CnstBits.getHiBits(64) == CnstBits.getLoBits(64)) {
	CnstBits = CnstBits.zextOrTrunc(64);
	uint64_t CnstVal = CnstBits.getZExtValue();

	// Certain magic vector constants (used to express things like NOT
	// and NEG) are passed through unmodified. This allows codegen patterns
	// for these operations to match. Special-purpose patterns will lower
	// these immediates to MOVIs if it proves necessary.
	if (VT.isInteger() && (CnstVal == 0 \|\| CnstVal == ~0ULL))
	return Op;

	// The many faces of MOVI...
	if (AArch64_AM::isAdvSIMDModImmType10(CnstVal)) {
	CnstVal = AArch64_AM::encodeAdvSIMDModImmType10(CnstVal);
	if (VT.getSizeInBits() == 128) {
	SDValue Mov = DAG.getNode(AArch64ISD::MOVIedit, dl, MVT::v2i64,
	DAG.getConstant(CnstVal, dl, MVT::i32));
	return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
	}

	// Support the V64 version via subregister insertion.
	SDValue Mov = DAG.getNode(AArch64ISD::MOVIedit, dl, MVT::f64,
	DAG.getConstant(CnstVal, dl, MVT::i32));
	return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
	}

	if (AArch64_AM::isAdvSIMDModImmType1(CnstVal)) {
	CnstVal = AArch64_AM::encodeAdvSIMDModImmType1(CnstVal);
	MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
	SDValue Mov = DAG.getNode(AArch64ISD::MOVIshift, dl, MovTy,
	DAG.getConstant(CnstVal, dl, MVT::i32),
	DAG.getConstant(0, dl, MVT::i32));
	return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
	}

	if (AArch64_AM::isAdvSIMDModImmType2(CnstVal)) {
	CnstVal = AArch64_AM::encodeAdvSIMDModImmType2(CnstVal);
	MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
	SDValue Mov = DAG.getNode(AArch64ISD::MOVIshift, dl, MovTy,
	DAG.getConstant(CnstVal, dl, MVT::i32),
	DAG.getConstant(8, dl, MVT::i32));
	return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
	}

	if (AArch64_AM::isAdvSIMDModImmType3(CnstVal)) {
	CnstVal = AArch64_AM::encodeAdvSIMDModImmType3(CnstVal);
	MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
	SDValue Mov = DAG.getNode(AArch64ISD::MOVIshift, dl, MovTy,
	DAG.getConstant(CnstVal, dl, MVT::i32),
	DAG.getConstant(16, dl, MVT::i32));
	return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
	}

	if (AArch64_AM::isAdvSIMDModImmType4(CnstVal)) {
	CnstVal = AArch64_AM::encodeAdvSIMDModImmType4(CnstVal);
	MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
	SDValue Mov = DAG.getNode(AArch64ISD::MOVIshift, dl, MovTy,
	DAG.getConstant(CnstVal, dl, MVT::i32),
	DAG.getConstant(24, dl, MVT::i32));
	return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
	}

	if (AArch64_AM::isAdvSIMDModImmType5(CnstVal)) {
	CnstVal = AArch64_AM::encodeAdvSIMDModImmType5(CnstVal);
	MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16;
	SDValue Mov = DAG.getNode(AArch64ISD::MOVIshift, dl, MovTy,
	DAG.getConstant(CnstVal, dl, MVT::i32),
	DAG.getConstant(0, dl, MVT::i32));
	return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
	}

	if (AArch64_AM::isAdvSIMDModImmType6(CnstVal)) {
	CnstVal = AArch64_AM::encodeAdvSIMDModImmType6(CnstVal);
	MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16;
	SDValue Mov = DAG.getNode(AArch64ISD::MOVIshift, dl, MovTy,
	DAG.getConstant(CnstVal, dl, MVT::i32),
	DAG.getConstant(8, dl, MVT::i32));
	return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
	}

	if (AArch64_AM::isAdvSIMDModImmType7(CnstVal)) {
	CnstVal = AArch64_AM::encodeAdvSIMDModImmType7(CnstVal);
	MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
	SDValue Mov = DAG.getNode(AArch64ISD::MOVImsl, dl, MovTy,
	DAG.getConstant(CnstVal, dl, MVT::i32),
	DAG.getConstant(264, dl, MVT::i32));
	return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
	}

	if (AArch64_AM::isAdvSIMDModImmType8(CnstVal)) {
	CnstVal = AArch64_AM::encodeAdvSIMDModImmType8(CnstVal);
	MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
	SDValue Mov = DAG.getNode(AArch64ISD::MOVImsl, dl, MovTy,
	DAG.getConstant(CnstVal, dl, MVT::i32),
	DAG.getConstant(272, dl, MVT::i32));
	return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
	}

	if (AArch64_AM::isAdvSIMDModImmType9(CnstVal)) {
	CnstVal = AArch64_AM::encodeAdvSIMDModImmType9(CnstVal);
	MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v16i8 : MVT::v8i8;
	SDValue Mov = DAG.getNode(AArch64ISD::MOVI, dl, MovTy,
	DAG.getConstant(CnstVal, dl, MVT::i32));
	return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
	}

	// The few faces of FMOV...
	if (AArch64_AM::isAdvSIMDModImmType11(CnstVal)) {
	CnstVal = AArch64_AM::encodeAdvSIMDModImmType11(CnstVal);
	MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4f32 : MVT::v2f32;
	SDValue Mov = DAG.getNode(AArch64ISD::FMOV, dl, MovTy,
	DAG.getConstant(CnstVal, dl, MVT::i32));
	return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
	}

	if (AArch64_AM::isAdvSIMDModImmType12(CnstVal) &&
	VT.getSizeInBits() == 128) {
	CnstVal = AArch64_AM::encodeAdvSIMDModImmType12(CnstVal);
	SDValue Mov = DAG.getNode(AArch64ISD::FMOV, dl, MVT::v2f64,
	DAG.getConstant(CnstVal, dl, MVT::i32));
	return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
	}

	// The many faces of MVNI...
	// From here on the bitwise complement of the constant is matched.
	CnstVal = ~CnstVal;
	if (AArch64_AM::isAdvSIMDModImmType1(CnstVal)) {
	CnstVal = AArch64_AM::encodeAdvSIMDModImmType1(CnstVal);
	MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
	SDValue Mov = DAG.getNode(AArch64ISD::MVNIshift, dl, MovTy,
	DAG.getConstant(CnstVal, dl, MVT::i32),
	DAG.getConstant(0, dl, MVT::i32));
	return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
	}

	if (AArch64_AM::isAdvSIMDModImmType2(CnstVal)) {
	CnstVal = AArch64_AM::encodeAdvSIMDModImmType2(CnstVal);
	MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
	SDValue Mov = DAG.getNode(AArch64ISD::MVNIshift, dl, MovTy,
	DAG.getConstant(CnstVal, dl, MVT::i32),
	DAG.getConstant(8, dl, MVT::i32));
	return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
	}

	if (AArch64_AM::isAdvSIMDModImmType3(CnstVal)) {
	CnstVal = AArch64_AM::encodeAdvSIMDModImmType3(CnstVal);
	MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
	SDValue Mov = DAG.getNode(AArch64ISD::MVNIshift, dl, MovTy,
	DAG.getConstant(CnstVal, dl, MVT::i32),
	DAG.getConstant(16, dl, MVT::i32));
	return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
	}

	if (AArch64_AM::isAdvSIMDModImmType4(CnstVal)) {
	CnstVal = AArch64_AM::encodeAdvSIMDModImmType4(CnstVal);
	MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
	SDValue Mov = DAG.getNode(AArch64ISD::MVNIshift, dl, MovTy,
	DAG.getConstant(CnstVal, dl, MVT::i32),
	DAG.getConstant(24, dl, MVT::i32));
	return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
	}

	if (AArch64_AM::isAdvSIMDModImmType5(CnstVal)) {
	CnstVal = AArch64_AM::encodeAdvSIMDModImmType5(CnstVal);
	MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16;
	SDValue Mov = DAG.getNode(AArch64ISD::MVNIshift, dl, MovTy,
	DAG.getConstant(CnstVal, dl, MVT::i32),
	DAG.getConstant(0, dl, MVT::i32));
	return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
	}

	if (AArch64_AM::isAdvSIMDModImmType6(CnstVal)) {
	CnstVal = AArch64_AM::encodeAdvSIMDModImmType6(CnstVal);
	MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16;
	SDValue Mov = DAG.getNode(AArch64ISD::MVNIshift, dl, MovTy,
	DAG.getConstant(CnstVal, dl, MVT::i32),
	DAG.getConstant(8, dl, MVT::i32));
	return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
	}

	if (AArch64_AM::isAdvSIMDModImmType7(CnstVal)) {
	CnstVal = AArch64_AM::encodeAdvSIMDModImmType7(CnstVal);
	MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
	SDValue Mov = DAG.getNode(AArch64ISD::MVNImsl, dl, MovTy,
	DAG.getConstant(CnstVal, dl, MVT::i32),
	DAG.getConstant(264, dl, MVT::i32));
	return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
	}

	if (AArch64_AM::isAdvSIMDModImmType8(CnstVal)) {
	CnstVal = AArch64_AM::encodeAdvSIMDModImmType8(CnstVal);
	MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
	SDValue Mov = DAG.getNode(AArch64ISD::MVNImsl, dl, MovTy,
	DAG.getConstant(CnstVal, dl, MVT::i32),
	DAG.getConstant(272, dl, MVT::i32));
	return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
	}
	}

	// No immediate form matched. Retry exactly once with the undef-toggled bit
	// pattern, then give up on the immediate forms.
	if (SecondTry)
	goto FailedModImm;
	SecondTry = true;
	CnstBits = UndefBits;
	goto AttemptModImm;
	}
	FailedModImm:

	// Scan through the operands to find some interesting properties we can
	// exploit:
	// 1) If only one value is used, we can use a DUP, or
	// 2) if only the low element is not undef, we can just insert that, or
	// 3) if only one constant value is used (w/ some non-constant lanes),
	// we can splat the constant value into the whole vector then fill
	// in the non-constant lanes.
	// 4) FIXME: If different constant values are used, but we can intelligently
	// select the values we'll be overwriting for the non-constant
	// lanes such that we can directly materialize the vector
	// some other way (MOVI, e.g.), we can be sneaky.
	unsigned NumElts = VT.getVectorNumElements();
	bool isOnlyLowElement = true;
	bool usesOnlyOneValue = true;
	bool usesOnlyOneConstantValue = true;
	bool isConstant = true;
	unsigned NumConstantLanes = 0;
	SDValue Value;
	SDValue ConstantValue;
	for (unsigned i = 0; i < NumElts; ++i) {
	SDValue V = Op.getOperand(i);
	// Undef lanes do not influence any of the properties gathered here.
	if (V.isUndef())
	continue;
	if (i > 0)
	isOnlyLowElement = false;
	if (!isa<ConstantFPSDNode>(V) && !isa<ConstantSDNode>(V))
	isConstant = false;

	if (isa<ConstantSDNode>(V) \|\| isa<ConstantFPSDNode>(V)) {
	++NumConstantLanes;
	if (!ConstantValue.getNode())
	ConstantValue = V;
	else if (ConstantValue != V)
	usesOnlyOneConstantValue = false;
	}

	if (!Value.getNode())
	Value = V;
	else if (V != Value)
	usesOnlyOneValue = false;
	}

	// Value is still empty only if every lane was undef.
	if (!Value.getNode())
	return DAG.getUNDEF(VT);

	if (isOnlyLowElement)
	return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Value);

	// Use DUP for non-constant splats. For f32 constant splats, reduce to
	// i32 and try again.
	if (usesOnlyOneValue) {
	if (!isConstant) {
	if (Value.getOpcode() != ISD::EXTRACT_VECTOR_ELT \|\|
	Value.getValueType() != VT)
	return DAG.getNode(AArch64ISD::DUP, dl, VT, Value);

	// This is actually a DUPLANExx operation, which keeps everything vectory.

	// DUPLANE works on 128-bit vectors, widen it if necessary.
	SDValue Lane = Value.getOperand(1);
	Value = Value.getOperand(0);
	if (Value.getValueSizeInBits() == 64)
	Value = WidenVector(Value, DAG);

	unsigned Opcode = getDUPLANEOp(VT.getVectorElementType());
	return DAG.getNode(Opcode, dl, VT, Value, Lane);
	}

	if (VT.getVectorElementType().isFloatingPoint()) {
	// Bitcast every lane to the same-sized integer type, lower the resulting
	// integer BUILD_VECTOR recursively, and bitcast the result back to VT.
	SmallVector<SDValue, 8> Ops;
	EVT EltTy = VT.getVectorElementType();
	assert ((EltTy == MVT::f16 \|\| EltTy == MVT::f32 \|\| EltTy == MVT::f64) &&
	"Unsupported floating-point vector type");
	MVT NewType = MVT::getIntegerVT(EltTy.getSizeInBits());
	for (unsigned i = 0; i < NumElts; ++i)
	Ops.push_back(DAG.getNode(ISD::BITCAST, dl, NewType, Op.getOperand(i)));
	EVT VecVT = EVT::getVectorVT(*DAG.getContext(), NewType, NumElts);
	SDValue Val = DAG.getBuildVector(VecVT, dl, Ops);
	Val = LowerBUILD_VECTOR(Val, DAG);
	if (Val.getNode())
	return DAG.getNode(ISD::BITCAST, dl, VT, Val);
	}
	}

	// If there was only one constant value used and for more than one lane,
	// start by splatting that value, then replace the non-constant lanes. This
	// is better than the default, which will perform a separate initialization
	// for each lane.
	if (NumConstantLanes > 0 && usesOnlyOneConstantValue) {
	SDValue Val = DAG.getNode(AArch64ISD::DUP, dl, VT, ConstantValue);
	// Now insert the non-constant lanes.
	for (unsigned i = 0; i < NumElts; ++i) {
	SDValue V = Op.getOperand(i);
	SDValue LaneIdx = DAG.getConstant(i, dl, MVT::i64);
	if (!isa<ConstantSDNode>(V) && !isa<ConstantFPSDNode>(V)) {
	// Note that type legalization likely mucked about with the VT of the
	// source operand, so we may have to convert it here before inserting.
	Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Val, V, LaneIdx);
	}
	}
	return Val;
	}

	// If all elements are constants and the case above didn't get hit, fall back
	// to the default expansion, which will generate a load from the constant
	// pool.
	if (isConstant)
	return SDValue();

	// Empirical tests suggest this is rarely worth it for vectors of length <= 2.
	if (NumElts >= 4) {
	if (SDValue shuffle = ReconstructShuffle(Op, DAG))
	return shuffle;
	}

	// If all else fails, just use a sequence of INSERT_VECTOR_ELT when we
	// know the default expansion would otherwise fall back on something even
	// worse. For a vector with one or two non-undef values, that's
	// scalar_to_vector for the elements followed by a shuffle (provided the
	// shuffle is valid for the target) and materialization element by element
	// on the stack followed by a load for everything else.
	if (!isConstant && !usesOnlyOneValue) {
	SDValue Vec = DAG.getUNDEF(VT);
	SDValue Op0 = Op.getOperand(0);
	unsigned ElemSize = VT.getScalarSizeInBits();
	unsigned i = 0;
	// For 32 and 64 bit types, use INSERT_SUBREG for lane zero to
	// a) Avoid a RMW dependency on the full vector register, and
	// b) Allow the register coalescer to fold away the copy if the
	// value is already in an S or D register.
	// Do not do this for UNDEF/LOAD nodes because we have better patterns
	// for those avoiding the SCALAR_TO_VECTOR/BUILD_VECTOR.
	if (!Op0.isUndef() && Op0.getOpcode() != ISD::LOAD &&
	(ElemSize == 32 \|\| ElemSize == 64)) {
	unsigned SubIdx = ElemSize == 32 ? AArch64::ssub : AArch64::dsub;
	MachineSDNode *N =
	DAG.getMachineNode(TargetOpcode::INSERT_SUBREG, dl, VT, Vec, Op0,
	DAG.getTargetConstant(SubIdx, dl, MVT::i32));
	Vec = SDValue(N, 0);
	// Lane 0 is now in place; the loop below starts at lane 1.
	++i;
	}
	for (; i < NumElts; ++i) {
	SDValue V = Op.getOperand(i);
	if (V.isUndef())
	continue;
	SDValue LaneIdx = DAG.getConstant(i, dl, MVT::i64);
	Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Vec, V, LaneIdx);
	}
	return Vec;
	}

	// Just use the default expansion. We failed to find a better alternative.
	return SDValue();
	}

	/// Custom lowering for INSERT_VECTOR_ELT.
	///
	/// A non-constant or out-of-range lane index yields an empty SDValue. The
	/// listed 128-bit vector types are returned unchanged. The listed 64-bit
	/// vector types are widened with WidenVector, the element is inserted into
	/// the wide vector, and the result is narrowed again with NarrowVector. Any
	/// other type yields an empty SDValue.
	SDValue AArch64TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
	                                                      SelectionDAG &DAG) const {
	  assert(Op.getOpcode() == ISD::INSERT_VECTOR_ELT && "Unknown opcode!");

	  SDValue DstVec = Op.getOperand(0);
	  SDValue NewElt = Op.getOperand(1);
	  SDValue LaneOp = Op.getOperand(2);
	  const EVT VT = DstVec.getValueType();

	  // Check for non-constant or out of range lane.
	  auto *LaneC = dyn_cast<ConstantSDNode>(LaneOp);
	  if (!LaneC)
	    return SDValue();
	  if (LaneC->getZExtValue() >= VT.getVectorNumElements())
	    return SDValue();

	  // Insertion/extraction are legal for V128 types.
	  const MVT::SimpleValueType LegalV128[] = {
	      MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
	      MVT::v4f32, MVT::v2f64, MVT::v8f16};
	  for (MVT::SimpleValueType Ty : LegalV128)
	    if (VT == Ty)
	      return Op;

	  // Only these V64 types can be handled by widening.
	  const MVT::SimpleValueType WidenableV64[] = {
	      MVT::v8i8, MVT::v4i16, MVT::v2i32, MVT::v1i64, MVT::v2f32, MVT::v4f16};
	  bool IsWidenable = false;
	  for (MVT::SimpleValueType Ty : WidenableV64) {
	    if (VT == Ty) {
	      IsWidenable = true;
	      break;
	    }
	  }
	  if (!IsWidenable)
	    return SDValue();

	  // For V64 types, we perform insertion by expanding the value
	  // to a V128 type and perform the insertion on that.
	  SDLoc DL(Op);
	  SDValue WideVec = WidenVector(DstVec, DAG);
	  SDValue Inserted = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL,
	                                 WideVec.getValueType(), WideVec, NewElt,
	                                 LaneOp);
	  // Re-narrow the resultant vector.
	  return NarrowVector(Inserted, DAG);
	}

	/// Custom lowering for EXTRACT_VECTOR_ELT.
	///
	/// A non-constant or out-of-range lane index yields an empty SDValue. The
	/// listed 128-bit vector types are returned unchanged. The listed 64-bit
	/// vector types are widened with WidenVector and the element is extracted
	/// from the wide vector, with i8/i16 element results produced as i32. Any
	/// other type yields an empty SDValue.
	SDValue
	AArch64TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
	                                               SelectionDAG &DAG) const {
	  assert(Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Unknown opcode!");

	  SDValue SrcVec = Op.getOperand(0);
	  SDValue LaneOp = Op.getOperand(1);
	  const EVT VT = SrcVec.getValueType();

	  // Check for non-constant or out of range lane.
	  auto *LaneC = dyn_cast<ConstantSDNode>(LaneOp);
	  if (!LaneC)
	    return SDValue();
	  if (LaneC->getZExtValue() >= VT.getVectorNumElements())
	    return SDValue();

	  // Insertion/extraction are legal for V128 types.
	  const MVT::SimpleValueType LegalV128[] = {
	      MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
	      MVT::v4f32, MVT::v2f64, MVT::v8f16};
	  for (MVT::SimpleValueType Ty : LegalV128)
	    if (VT == Ty)
	      return Op;

	  // Only these V64 types can be handled by widening.
	  const MVT::SimpleValueType WidenableV64[] = {
	      MVT::v8i8, MVT::v4i16, MVT::v2i32, MVT::v1i64, MVT::v2f32, MVT::v4f16};
	  bool IsWidenable = false;
	  for (MVT::SimpleValueType Ty : WidenableV64) {
	    if (VT == Ty) {
	      IsWidenable = true;
	      break;
	    }
	  }
	  if (!IsWidenable)
	    return SDValue();

	  // For V64 types, we perform extraction by expanding the value
	  // to a V128 type and perform the extraction on that.
	  SDLoc DL(Op);
	  SDValue WideVec = WidenVector(SrcVec, DAG);

	  // Sub-32-bit integer elements are extracted as i32.
	  EVT ExtrTy = WideVec.getValueType().getVectorElementType();
	  if (ExtrTy == MVT::i16 || ExtrTy == MVT::i8)
	    ExtrTy = MVT::i32;

	  // For extractions, we just return the result directly.
	  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtrTy, WideVec, LaneOp);
	}

	/// Custom lowering for EXTRACT_SUBVECTOR.
	///
	/// The node is kept as-is in two cases: the constant start index is 0, or the
	/// result is 64 bits wide and starts exactly 64 bits into the source vector.
	/// Everything else (non-vector source, non-constant index, other offsets)
	/// yields an empty SDValue.
	SDValue AArch64TargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op,
	                                                      SelectionDAG &DAG) const {
	  EVT SrcVT = Op.getOperand(0).getValueType();
	  SDLoc dl(Op);

	  // Just in case...
	  if (!SrcVT.isVector())
	    return SDValue();

	  auto *IdxC = dyn_cast<ConstantSDNode>(Op.getOperand(1));
	  if (!IdxC)
	    return SDValue();

	  const unsigned StartIdx = IdxC->getZExtValue();
	  const unsigned ResultBits = Op.getValueSizeInBits();

	  // This will get lowered to an appropriate EXTRACT_SUBREG in ISel.
	  const bool StartsAtZero = StartIdx == 0;
	  // Extracting the upper 64-bits of a 128-bit vector is matched directly.
	  const bool IsUpper64 =
	      ResultBits == 64 && StartIdx * SrcVT.getScalarSizeInBits() == 64;

	  if (StartsAtZero || IsUpper64)
	    return Op;
	  return SDValue();
	}

	bool AArch64TargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M,
	EVT VT) const {
	if (VT.getVectorNumElements() == 4 &&
	(VT.is128BitVector() \|\| VT.is64BitVector())) {
	unsigned PFIndexes[4];
	for (unsigned i = 0; i != 4; ++i) {
	if (M[i] < 0)
	PFIndexes[i] = 8;
	else
	PFIndexes[i] = M[i];
	}

	// Compute the index in the perfect shuffle table.
	unsigned PFTableIndex = PFIndexes[0] * 9 * 9 * 9 + PFIndexes[1] * 9 * 9 +
	PFIndexes[2] * 9 + PFIndexes[3];
	unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
	unsigned Cost = (PFEntry >> 30);

	if (Cost <= 4)
	return true;
	}

	bool DummyBool;
	int DummyInt;
	unsigned DummyUnsigned;

	return (ShuffleVectorSDNode::isSplatMask(&M[0], VT) \|\| isREVMask(M, VT, 64) \|\|
	isREVMask(M, VT, 32) \|\| isREVMask(M, VT, 16) \|\|
	isEXTMask(M, VT, DummyBool, DummyUnsigned) \|\|
	// isTBLMask(M, VT) \|\| // FIXME: Port TBL support from ARM.
	isTRNMask(M, VT, DummyUnsigned) \|\| isUZPMask(M, VT, DummyUnsigned) \|\|
	isZIPMask(M, VT, DummyUnsigned) \|\|
	isTRN_v_undef_Mask(M, VT, DummyUnsigned) \|\|
	isUZP_v_undef_Mask(M, VT, DummyUnsigned) \|\|
	isZIP_v_undef_Mask(M, VT, DummyUnsigned) \|\|
	isINSMask(M, VT.getVectorNumElements(), DummyBool, DummyInt) \|\|
	isConcatMask(M, VT, VT.getSizeInBits() == 128));
	}

	/// getVShiftImm - Look through any BITCASTs on \p Op and test whether the
	/// underlying node is a build_vector that is a constant splat no wider than
	/// \p ElementBits. On success the sign-extended splat value is stored in
	/// \p Cnt and true is returned; otherwise \p Cnt is left untouched.
	static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt) {
	  // Ignore bit_converts.
	  for (; Op.getOpcode() == ISD::BITCAST; Op = Op.getOperand(0)) {
	  }

	  auto *BVN = dyn_cast<BuildVectorSDNode>(Op.getNode());
	  if (!BVN)
	    return false;

	  APInt SplatBits, SplatUndef;
	  unsigned SplatBitSize;
	  bool HasAnyUndefs;
	  if (!BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs,
	                            ElementBits))
	    return false;
	  if (SplatBitSize > ElementBits)
	    return false;

	  Cnt = SplatBits.getSExtValue();
	  return true;
	}

	/// isVShiftLImm - Check if \p Op is a valid build_vector immediate for a
	/// vector shift left of type \p VT, storing the amount in \p Cnt. Accepted
	/// ranges:
	///   0 <= Cnt <  ElementBits   for a regular left shift;
	///   0 <= Cnt <= ElementBits   for a long left shift (\p isLong).
	static bool isVShiftLImm(SDValue Op, EVT VT, bool isLong, int64_t &Cnt) {
	  assert(VT.isVector() && "vector shift count is not a vector type");
	  const int64_t ElementBits = VT.getScalarSizeInBits();
	  if (!getVShiftImm(Op, ElementBits, Cnt))
	    return false;
	  if (Cnt < 0)
	    return false;
	  // A long shift may use the full element width, a regular one may not.
	  if (isLong)
	    return Cnt <= ElementBits;
	  return Cnt < ElementBits;
	}

	/// isVShiftRImm - Check if \p Op is a valid build_vector immediate for a
	/// vector shift right of type \p VT, storing the amount in \p Cnt. Accepted
	/// ranges:
	///   1 <= Cnt <= ElementBits       for a regular right shift;
	///   1 <= Cnt <= ElementBits / 2   for a narrowing right shift (\p isNarrow).
	static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, int64_t &Cnt) {
	  assert(VT.isVector() && "vector shift count is not a vector type");
	  const int64_t ElementBits = VT.getScalarSizeInBits();
	  if (!getVShiftImm(Op, ElementBits, Cnt))
	    return false;

	  int64_t MaxShift = ElementBits;
	  if (isNarrow)
	    MaxShift = ElementBits / 2;
	  return !(Cnt < 1 || Cnt > MaxShift);
	}

	/// Lower a vector SHL, SRA or SRL whose shift amount is itself a vector.
	///
	/// A constant-splat amount accepted by isVShiftLImm / isVShiftRImm (and
	/// smaller than the element size) becomes an immediate-form VSHL, VASHR or
	/// VLSHR node. Any other amount is lowered to the ushl / sshl intrinsic; for
	/// right shifts the amount is negated first. If the shift amount is not a
	/// vector, \p Op is returned unchanged.
	SDValue AArch64TargetLowering::LowerVectorSRA_SRL_SHL(SDValue Op,
	                                                      SelectionDAG &DAG) const {
	  SDValue Amount = Op.getOperand(1);
	  if (!Amount.getValueType().isVector())
	    return Op;

	  SDValue Value = Op.getOperand(0);
	  EVT VT = Op.getValueType();
	  SDLoc DL(Op);
	  const unsigned EltSize = VT.getScalarSizeInBits();
	  int64_t Cnt;

	  switch (Op.getOpcode()) {
	  case ISD::SHL: {
	    // Left shift immediate.
	    if (isVShiftLImm(Amount, VT, false, Cnt) && Cnt < EltSize)
	      return DAG.getNode(AArch64ISD::VSHL, DL, VT, Value,
	                         DAG.getConstant(Cnt, DL, MVT::i32));

	    // Left shift register.
	    return DAG.getNode(
	        ISD::INTRINSIC_WO_CHAIN, DL, VT,
	        DAG.getConstant(Intrinsic::aarch64_neon_ushl, DL, MVT::i32), Value,
	        Amount);
	  }
	  case ISD::SRA:
	  case ISD::SRL: {
	    const bool IsArithmetic = Op.getOpcode() == ISD::SRA;

	    // Right shift immediate.
	    if (isVShiftRImm(Amount, VT, false, Cnt) && Cnt < EltSize) {
	      unsigned ImmOpc = IsArithmetic ? AArch64ISD::VASHR : AArch64ISD::VLSHR;
	      return DAG.getNode(ImmOpc, DL, VT, Value,
	                         DAG.getConstant(Cnt, DL, MVT::i32));
	    }

	    // Right shift register. There is no shift-right-by-register instruction,
	    // but the shift-left-by-register instruction takes a signed amount in
	    // which negative values request a right shift, so negate the amount.
	    unsigned IntrinsicID = IsArithmetic ? Intrinsic::aarch64_neon_sshl
	                                        : Intrinsic::aarch64_neon_ushl;
	    SDValue NegAmount = DAG.getNode(AArch64ISD::NEG, DL, VT, Amount);
	    return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
	                       DAG.getConstant(IntrinsicID, DL, MVT::i32), Value,
	                       NegAmount);
	  }
	  default:
	    llvm_unreachable("unexpected shift opcode");
	  }
	}

	/// Emit a single AArch64 vector comparison node for \p LHS and \p RHS under
	/// condition \p CC, producing a value of type \p VT (which must have the same
	/// total bit width as the operand type).
	///
	/// If \p RHS is a build_vector for which resolveBuildVector succeeds and
	/// yields all-zero constant bits, the compare-against-zero node (the "...z"
	/// variants) is used wherever the switch provides one. Conditions without a
	/// dedicated node are formed by swapping the operands of the opposite
	/// comparison, and NE is formed as NOT(EQ).
	///
	/// Floating-point element types handle NE, EQ, GE, GT, LS, MI and, only when
	/// \p NoNans is set, LT. Other element types handle NE, EQ, GE, GT, LE, LS,
	/// LO, LT, HI and HS. Any other condition yields an empty SDValue.
	static SDValue EmitVectorComparison(SDValue LHS, SDValue RHS,
	AArch64CC::CondCode CC, bool NoNans, EVT VT,
	const SDLoc &dl, SelectionDAG &DAG) {
	EVT SrcVT = LHS.getValueType();
	assert(VT.getSizeInBits() == SrcVT.getSizeInBits() &&
	"function only supposed to emit natural comparisons");

	// Detect a constant all-zero RHS so the single-operand "compare with zero"
	// nodes can be used.
	BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(RHS.getNode());
	APInt CnstBits(VT.getSizeInBits(), 0);
	APInt UndefBits(VT.getSizeInBits(), 0);
	bool IsCnst = BVN && resolveBuildVector(BVN, CnstBits, UndefBits);
	bool IsZero = IsCnst && (CnstBits == 0);

	// Floating-point comparisons.
	if (SrcVT.getVectorElementType().isFloatingPoint()) {
	switch (CC) {
	default:
	return SDValue();
	case AArch64CC::NE: {
	// NE is emitted as the inverse of EQ.
	SDValue Fcmeq;
	if (IsZero)
	Fcmeq = DAG.getNode(AArch64ISD::FCMEQz, dl, VT, LHS);
	else
	Fcmeq = DAG.getNode(AArch64ISD::FCMEQ, dl, VT, LHS, RHS);
	return DAG.getNode(AArch64ISD::NOT, dl, VT, Fcmeq);
	}
	case AArch64CC::EQ:
	if (IsZero)
	return DAG.getNode(AArch64ISD::FCMEQz, dl, VT, LHS);
	return DAG.getNode(AArch64ISD::FCMEQ, dl, VT, LHS, RHS);
	case AArch64CC::GE:
	if (IsZero)
	return DAG.getNode(AArch64ISD::FCMGEz, dl, VT, LHS);
	return DAG.getNode(AArch64ISD::FCMGE, dl, VT, LHS, RHS);
	case AArch64CC::GT:
	if (IsZero)
	return DAG.getNode(AArch64ISD::FCMGTz, dl, VT, LHS);
	return DAG.getNode(AArch64ISD::FCMGT, dl, VT, LHS, RHS);
	case AArch64CC::LS:
	// Emitted as FCMGE with the operands swapped.
	if (IsZero)
	return DAG.getNode(AArch64ISD::FCMLEz, dl, VT, LHS);
	return DAG.getNode(AArch64ISD::FCMGE, dl, VT, RHS, LHS);
	case AArch64CC::LT:
	if (!NoNans)
	return SDValue();
	// If we ignore NaNs then we can use the MI implementation.
	LLVM_FALLTHROUGH;
	case AArch64CC::MI:
	// Emitted as FCMGT with the operands swapped.
	if (IsZero)
	return DAG.getNode(AArch64ISD::FCMLTz, dl, VT, LHS);
	return DAG.getNode(AArch64ISD::FCMGT, dl, VT, RHS, LHS);
	}
	}

	// Non-floating-point (integer) comparisons.
	switch (CC) {
	default:
	return SDValue();
	case AArch64CC::NE: {
	// NE is emitted as the inverse of EQ.
	SDValue Cmeq;
	if (IsZero)
	Cmeq = DAG.getNode(AArch64ISD::CMEQz, dl, VT, LHS);
	else
	Cmeq = DAG.getNode(AArch64ISD::CMEQ, dl, VT, LHS, RHS);
	return DAG.getNode(AArch64ISD::NOT, dl, VT, Cmeq);
	}
	case AArch64CC::EQ:
	if (IsZero)
	return DAG.getNode(AArch64ISD::CMEQz, dl, VT, LHS);
	return DAG.getNode(AArch64ISD::CMEQ, dl, VT, LHS, RHS);
	case AArch64CC::GE:
	if (IsZero)
	return DAG.getNode(AArch64ISD::CMGEz, dl, VT, LHS);
	return DAG.getNode(AArch64ISD::CMGE, dl, VT, LHS, RHS);
	case AArch64CC::GT:
	if (IsZero)
	return DAG.getNode(AArch64ISD::CMGTz, dl, VT, LHS);
	return DAG.getNode(AArch64ISD::CMGT, dl, VT, LHS, RHS);
	case AArch64CC::LE:
	// Emitted as CMGE with the operands swapped.
	if (IsZero)
	return DAG.getNode(AArch64ISD::CMLEz, dl, VT, LHS);
	return DAG.getNode(AArch64ISD::CMGE, dl, VT, RHS, LHS);
	case AArch64CC::LS:
	// Emitted as CMHS with the operands swapped; no zero form is used here.
	return DAG.getNode(AArch64ISD::CMHS, dl, VT, RHS, LHS);
	case AArch64CC::LO:
	// Emitted as CMHI with the operands swapped; no zero form is used here.
	return DAG.getNode(AArch64ISD::CMHI, dl, VT, RHS, LHS);
	case AArch64CC::LT:
	// Emitted as CMGT with the operands swapped.
	if (IsZero)
	return DAG.getNode(AArch64ISD::CMLTz, dl, VT, LHS);
	return DAG.getNode(AArch64ISD::CMGT, dl, VT, RHS, LHS);
	case AArch64CC::HI:
	return DAG.getNode(AArch64ISD::CMHI, dl, VT, LHS, RHS);
	case AArch64CC::HS:
	return DAG.getNode(AArch64ISD::CMHS, dl, VT, LHS, RHS);
	}
	}

	/// Custom-lower a vector SETCC.
	///
	/// Integer element types map to a single EmitVectorComparison call. f16
	/// element types are not handled here (an empty SDValue is returned). f32 and
	/// f64 element types are translated by changeVectorFPCCToAArch64CC into one or
	/// two AArch64 conditions whose results are OR'ed together and, if requested,
	/// inverted. The comparison is produced in the integer-element version of the
	/// operand type and then sign-extended or truncated to the result type of
	/// \p Op. An empty SDValue is returned when a required FP comparison cannot be
	/// emitted.
	SDValue AArch64TargetLowering::LowerVSETCC(SDValue Op,
	                                           SelectionDAG &DAG) const {
	  SDLoc dl(Op);
	  SDValue LHS = Op.getOperand(0);
	  SDValue RHS = Op.getOperand(1);
	  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();

	  const EVT OperandVT = LHS.getValueType();
	  const EVT EltVT = OperandVT.getVectorElementType();
	  const EVT ResultVT = Op.getValueType();
	  EVT CmpVT = OperandVT.changeVectorElementTypeToInteger();

	  if (EltVT.isInteger()) {
	    assert(LHS.getValueType() == RHS.getValueType());
	    SDValue IntCmp = EmitVectorComparison(LHS, RHS, changeIntCCToAArch64CC(CC),
	                                          false, CmpVT, dl, DAG);
	    return DAG.getSExtOrTrunc(IntCmp, dl, ResultVT);
	  }

	  if (EltVT == MVT::f16)
	    return SDValue();

	  assert(LHS.getValueType().getVectorElementType() == MVT::f32 ||
	         LHS.getValueType().getVectorElementType() == MVT::f64);

	  // The mapping of LLVM FP condition codes onto AArch64 condition codes is
	  // not one-to-one: some need two comparisons, some need the result inverted.
	  AArch64CC::CondCode First, Second;
	  bool Invert;
	  changeVectorFPCCToAArch64CC(CC, First, Second, Invert);

	  const bool NoNaNs = getTargetMachine().Options.NoNaNsFPMath;
	  SDValue Result = EmitVectorComparison(LHS, RHS, First, NoNaNs, CmpVT, dl, DAG);
	  if (!Result.getNode())
	    return SDValue();

	  // AL as the second condition means "no second comparison needed".
	  if (Second != AArch64CC::AL) {
	    SDValue Extra =
	        EmitVectorComparison(LHS, RHS, Second, NoNaNs, CmpVT, dl, DAG);
	    if (!Extra.getNode())
	      return SDValue();
	    Result = DAG.getNode(ISD::OR, dl, CmpVT, Result, Extra);
	  }

	  Result = DAG.getSExtOrTrunc(Result, dl, ResultVT);
	  if (!Invert)
	    return Result;
	  return DAG.getNOT(dl, Result, Result.getValueType());
	}

	/// getTgtMemIntrinsic - Represent NEON load and store intrinsics as
	/// MemIntrinsicNodes. The associated MachineMemOperands record the alignment
	/// specified in the intrinsic calls.
	///
	/// Fills \p Info for the NEON structured load/store intrinsics and for the
	/// exclusive load/store intrinsics and returns true. For any other intrinsic
	/// \p Info is left untouched and false is returned.
	bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
	                                               const CallInst &I,
	                                               unsigned Intrinsic) const {
	  auto &DL = I.getModule()->getDataLayout();
	  switch (Intrinsic) {
	  case Intrinsic::aarch64_neon_ld2:
	  case Intrinsic::aarch64_neon_ld3:
	  case Intrinsic::aarch64_neon_ld4:
	  case Intrinsic::aarch64_neon_ld1x2:
	  case Intrinsic::aarch64_neon_ld1x3:
	  case Intrinsic::aarch64_neon_ld1x4:
	  case Intrinsic::aarch64_neon_ld2lane:
	  case Intrinsic::aarch64_neon_ld3lane:
	  case Intrinsic::aarch64_neon_ld4lane:
	  case Intrinsic::aarch64_neon_ld2r:
	  case Intrinsic::aarch64_neon_ld3r:
	  case Intrinsic::aarch64_neon_ld4r: {
	    Info.opc = ISD::INTRINSIC_W_CHAIN;
	    // Conservatively set memVT to the entire set of vectors loaded, expressed
	    // as a vector of i64 covering the whole return type.
	    uint64_t NumElts = DL.getTypeSizeInBits(I.getType()) / 64;
	    Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
	    // The address is the last operand of the call.
	    Info.ptrVal = I.getArgOperand(I.getNumArgOperands() - 1);
	    Info.offset = 0;
	    Info.align = 0;
	    Info.vol = false; // volatile loads with NEON intrinsics not supported
	    Info.readMem = true;
	    Info.writeMem = false;
	    return true;
	  }
	  case Intrinsic::aarch64_neon_st2:
	  case Intrinsic::aarch64_neon_st3:
	  case Intrinsic::aarch64_neon_st4:
	  case Intrinsic::aarch64_neon_st1x2:
	  case Intrinsic::aarch64_neon_st1x3:
	  case Intrinsic::aarch64_neon_st1x4:
	  case Intrinsic::aarch64_neon_st2lane:
	  case Intrinsic::aarch64_neon_st3lane:
	  case Intrinsic::aarch64_neon_st4lane: {
	    Info.opc = ISD::INTRINSIC_VOID;
	    // Conservatively set memVT to the entire set of vectors stored. The
	    // address is the last operand and the vectors are the leading operands, so
	    // the scan must begin at operand 0 (starting at 1 would leave the first
	    // stored vector out of the size) and stops at the first non-vector operand.
	    unsigned NumElts = 0;
	    for (unsigned ArgI = 0, ArgE = I.getNumArgOperands(); ArgI < ArgE; ++ArgI) {
	      Type *ArgTy = I.getArgOperand(ArgI)->getType();
	      if (!ArgTy->isVectorTy())
	        break;
	      NumElts += DL.getTypeSizeInBits(ArgTy) / 64;
	    }
	    Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
	    Info.ptrVal = I.getArgOperand(I.getNumArgOperands() - 1);
	    Info.offset = 0;
	    Info.align = 0;
	    Info.vol = false; // volatile stores with NEON intrinsics not supported
	    Info.readMem = false;
	    Info.writeMem = true;
	    return true;
	  }
	  case Intrinsic::aarch64_ldaxr:
	  case Intrinsic::aarch64_ldxr: {
	    // Exclusive load: the memory type is the pointee type of operand 0.
	    PointerType *PtrTy = cast<PointerType>(I.getArgOperand(0)->getType());
	    Info.opc = ISD::INTRINSIC_W_CHAIN;
	    Info.memVT = MVT::getVT(PtrTy->getElementType());
	    Info.ptrVal = I.getArgOperand(0);
	    Info.offset = 0;
	    Info.align = DL.getABITypeAlignment(PtrTy->getElementType());
	    Info.vol = true;
	    Info.readMem = true;
	    Info.writeMem = false;
	    return true;
	  }
	  case Intrinsic::aarch64_stlxr:
	  case Intrinsic::aarch64_stxr: {
	    // Exclusive store: the address is operand 1.
	    PointerType *PtrTy = cast<PointerType>(I.getArgOperand(1)->getType());
	    Info.opc = ISD::INTRINSIC_W_CHAIN;
	    Info.memVT = MVT::getVT(PtrTy->getElementType());
	    Info.ptrVal = I.getArgOperand(1);
	    Info.offset = 0;
	    Info.align = DL.getABITypeAlignment(PtrTy->getElementType());
	    Info.vol = true;
	    Info.readMem = false;
	    Info.writeMem = true;
	    return true;
	  }
	  case Intrinsic::aarch64_ldaxp:
	  case Intrinsic::aarch64_ldxp:
	    // Exclusive pair load: modelled as a 16-byte-aligned i128 access.
	    Info.opc = ISD::INTRINSIC_W_CHAIN;
	    Info.memVT = MVT::i128;
	    Info.ptrVal = I.getArgOperand(0);
	    Info.offset = 0;
	    Info.align = 16;
	    Info.vol = true;
	    Info.readMem = true;
	    Info.writeMem = false;
	    return true;
	  case Intrinsic::aarch64_stlxp:
	  case Intrinsic::aarch64_stxp:
	    // Exclusive pair store: the address is operand 2.
	    Info.opc = ISD::INTRINSIC_W_CHAIN;
	    Info.memVT = MVT::i128;
	    Info.ptrVal = I.getArgOperand(2);
	    Info.offset = 0;
	    Info.align = 16;
	    Info.vol = true;
	    Info.readMem = false;
	    Info.writeMem = true;
	    return true;
	  default:
	    break;
	  }

	  return false;
	}

	// Truncations from 64-bit GPR to 32-bit GPR is free.
	bool AArch64TargetLowering::isTruncateFree(Type Ty1, Type Ty2) const {
	if (!Ty1->isIntegerTy() \|\| !Ty2->isIntegerTy())
	return false;
	unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
	unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
	return NumBits1 > NumBits2;
	}
	// EVT flavour of isTruncateFree: a truncation is free when both types are
	// non-vector integers and \p VT1 is strictly wider than \p VT2.
	bool AArch64TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
	  if (VT1.isVector() || VT2.isVector())
	    return false;
	  if (!VT1.isInteger() || !VT2.isInteger())
	    return false;
	  const unsigned SrcBits = VT1.getSizeInBits();
	  const unsigned DstBits = VT2.getSizeInBits();
	  return SrcBits > DstBits;
	}

	/// Check if it is profitable to hoist instruction \p I from a then/else block
	/// into the if block.
	///
	/// Hoisting is reported as unprofitable only when \p I is an FMul with exactly
	/// one use, that user is an FAdd or FSub, and the pair could be fused into an
	/// FMA (FMA is faster and legal/custom for the type, and fusion is permitted
	/// by the target options), because we prefer FMSUB/FMADD.
	bool AArch64TargetLowering::isProfitableToHoist(Instruction *I) const {
	  if (I->getOpcode() != Instruction::FMul || I->getNumUses() != 1)
	    return true;

	  Instruction *User = I->user_back();
	  if (User && User->getOpcode() != Instruction::FSub &&
	      User->getOpcode() != Instruction::FAdd)
	    return true;

	  const DataLayout &DL = I->getModule()->getDataLayout();
	  EVT VT = getValueType(DL, User->getOperand(0)->getType());

	  const TargetOptions &Options = getTargetMachine().Options;
	  const bool FusionAllowed =
	      Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath;
	  const bool WouldFormFMA = isFMAFasterThanFMulAndFAdd(VT) &&
	                            isOperationLegalOrCustom(ISD::FMA, VT) &&
	                            FusionAllowed;
	  return !WouldFormFMA;
	}

	// All 32-bit GPR operations implicitly zero the high-half of the corresponding
	// 64-bit GPR.
	bool AArch64TargetLowering::isZExtFree(Type Ty1, Type Ty2) const {
	if (!Ty1->isIntegerTy() \|\| !Ty2->isIntegerTy())
	return false;
	unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
	unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
	return NumBits1 == 32 && NumBits2 == 64;
	}
	// EVT flavour of isZExtFree: free only for a non-vector 32-bit integer
	// extended to a non-vector 64-bit integer.
	bool AArch64TargetLowering::isZExtFree(EVT VT1, EVT VT2) const {
	  if (VT1.isVector() || VT2.isVector())
	    return false;
	  if (!VT1.isInteger() || !VT2.isInteger())
	    return false;
	  const unsigned SrcBits = VT1.getSizeInBits();
	  const unsigned DstBits = VT2.getSizeInBits();
	  return SrcBits == 32 && DstBits == 64;
	}

	// SDValue flavour of isZExtFree. In addition to the type-based rule, the
	// zero-extension of a LOAD result is free when both types are simple,
	// non-vector integers and the loaded type is at most 32 bits wide.
	bool AArch64TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
	  const EVT VT1 = Val.getValueType();
	  if (isZExtFree(VT1, VT2))
	    return true;

	  if (Val.getOpcode() != ISD::LOAD)
	    return false;

	  // 8-, 16-, and 32-bit integer loads all implicitly zero-extend.
	  auto IsSimpleScalarInt = [](EVT VT) {
	    return VT.isSimple() && !VT.isVector() && VT.isInteger();
	  };
	  return IsSimpleScalarInt(VT1) && IsSimpleScalarInt(VT2) &&
	         VT1.getSizeInBits() <= 32;
	}

	/// Decide whether the extension instruction \p Ext is free.
	///
	/// Floating-point extensions and vector-typed extensions are never free.
	/// Otherwise every use of \p Ext is inspected, and the extension is free only
	/// if each use is one of:
	///   - a left shift by a constant amount;
	///   - a getelementptr index whose implied scaling shift is between 1 and 4;
	///   - a trunc back to the type of the extension's own operand (a no-op).
	/// Any other user makes the extension not free.
	bool AArch64TargetLowering::isExtFreeImpl(const Instruction *Ext) const {
	if (isa<FPExtInst>(Ext))
	return false;

	// Vector types are not free.
	if (Ext->getType()->isVectorTy())
	return false;

	for (const Use &U : Ext->uses()) {
	// The extension is free if we can fold it with a left shift in an
	// addressing mode or an arithmetic operation: add, sub, and cmp.

	// Is there a shift?
	const Instruction *Instr = cast<Instruction>(U.getUser());

	// Is this a constant shift?
	switch (Instr->getOpcode()) {
	case Instruction::Shl:
	if (!isa<ConstantInt>(Instr->getOperand(1)))
	return false;
	break;
	case Instruction::GetElementPtr: {
	// Step the GEP type iterator to the index operand that uses the extension
	// (operand 0 of a GEP is the base pointer, hence the -1).
	gep_type_iterator GTI = gep_type_begin(Instr);
	auto &DL = Ext->getModule()->getDataLayout();
	std::advance(GTI, U.getOperandNo()-1);
	Type *IdxTy = GTI.getIndexedType();
	// This extension will end up with a shift because of the scaling factor.
	// 8-bit sized types have a scaling factor of 1, thus a shift amount of 0.
	// Get the shift amount based on the scaling factor:
	// log2(sizeof(IdxTy)) - log2(8).
	uint64_t ShiftAmt =
	countTrailingZeros(DL.getTypeStoreSizeInBits(IdxTy)) - 3;
	// Is the constant foldable in the shift of the addressing mode?
	// I.e., shift amount is between 1 and 4 inclusive.
	if (ShiftAmt == 0 \|\| ShiftAmt > 4)
	return false;
	break;
	}
	case Instruction::Trunc:
	// Check if this is a noop.
	// trunc(sext ty1 to ty2) to ty1.
	if (Instr->getType() == Ext->getOperand(0)->getType())
	continue;
	// Any other trunc is treated like an unknown user.
	LLVM_FALLTHROUGH;
	default:
	return false;
	}

	// At this point we can use the bfm family, so this extension is free
	// for that use.
	}
	return true;
	}

	/// Report whether loads of \p LoadedType can be paired. Only simple types that
	/// are integer or floating point and exactly 32 or 64 bits wide qualify. For
	/// simple integer/floating-point types \p RequiredAligment is set to 0;
	/// otherwise it is left untouched.
	bool AArch64TargetLowering::hasPairedLoad(EVT LoadedType,
	                                          unsigned &RequiredAligment) const {
	  if (!LoadedType.isSimple())
	    return false;
	  if (!LoadedType.isInteger() && !LoadedType.isFloatingPoint())
	    return false;

	  // Cyclone supports unaligned accesses.
	  RequiredAligment = 0;

	  switch (LoadedType.getSizeInBits()) {
	  case 32:
	  case 64:
	    return true;
	  default:
	    return false;
	  }
	}

	/// \brief Lower an interleaved load into a ldN intrinsic.
	///
	/// E.g. Lower an interleaved load (Factor = 2):
	/// %wide.vec = load <8 x i32>, <8 x i32>* %ptr
	/// %v0 = shuffle %wide.vec, undef, <0, 2, 4, 6> ; Extract even elements
	/// %v1 = shuffle %wide.vec, undef, <1, 3, 5, 7> ; Extract odd elements
	///
	/// Into:
	/// %ld2 = { <4 x i32>, <4 x i32> } call llvm.aarch64.neon.ld2(%ptr)
	/// %vec0 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 0
	/// %vec1 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 1
	bool AArch64TargetLowering::lowerInterleavedLoad(
	LoadInst LI, ArrayRef<ShuffleVectorInst > Shuffles,
	ArrayRef<unsigned> Indices, unsigned Factor) const {
	assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
	"Invalid interleave factor");
	assert(!Shuffles.empty() && "Empty shufflevector input");
	assert(Shuffles.size() == Indices.size() &&
	"Unmatched number of shufflevectors and indices");

	const DataLayout &DL = LI->getModule()->getDataLayout();

	VectorType *VecTy = Shuffles[0]->getType();
	unsigned VecSize = DL.getTypeSizeInBits(VecTy);

	// Skip if we do not have NEON and skip illegal vector types.
	if (!Subtarget->hasNEON() \|\| (VecSize != 64 && VecSize != 128))
	return false;

	// A pointer vector can not be the return type of the ldN intrinsics. Need to
	// load integer vectors first and then convert to pointer vectors.
	Type *EltTy = VecTy->getVectorElementType();
	if (EltTy->isPointerTy())
	VecTy =
	VectorType::get(DL.getIntPtrType(EltTy), VecTy->getVectorNumElements());

	Type *PtrTy = VecTy->getPointerTo(LI->getPointerAddressSpace());
	Type *Tys[2] = {VecTy, PtrTy};
	static const Intrinsic::ID LoadInts[3] = {Intrinsic::aarch64_neon_ld2,
	Intrinsic::aarch64_neon_ld3,
	Intrinsic::aarch64_neon_ld4};
	Function *LdNFunc =
	Intrinsic::getDeclaration(LI->getModule(), LoadInts[Factor - 2], Tys);

	IRBuilder<> Builder(LI);
	Value *Ptr = Builder.CreateBitCast(LI->getPointerOperand(), PtrTy);

	CallInst *LdN = Builder.CreateCall(LdNFunc, Ptr, "ldN");

	// Replace uses of each shufflevector with the corresponding vector loaded
	// by ldN.
	for (unsigned i = 0; i < Shuffles.size(); i++) {
	ShuffleVectorInst *SVI = Shuffles[i];
	unsigned Index = Indices[i];

	Value *SubVec = Builder.CreateExtractValue(LdN, Index);

	// Convert the integer vector to pointer vector if the element is pointer.
	if (EltTy->isPointerTy())
	SubVec = Builder.CreateIntToPtr(SubVec, SVI->getType());

	SVI->replaceAllUsesWith(SubVec);
	}

	return true;
	}

	/// \brief Build a constant vector of \p NumElts consecutive i32 values
	/// beginning at \p Start.
	///
	/// I.e. <Start, Start + 1, ..., Start + NumElts - 1>
	static Constant *getSequentialMask(IRBuilder<> &Builder, unsigned Start,
	                                   unsigned NumElts) {
	  SmallVector<Constant *, 16> Lanes;
	  unsigned Offset = 0;
	  while (Offset < NumElts) {
	    Lanes.push_back(Builder.getInt32(Start + Offset));
	    ++Offset;
	  }
	  return ConstantVector::get(Lanes);
	}

	/// \brief Lower an interleaved store into a stN intrinsic.
	///
	/// E.g. Lower an interleaved store (Factor = 3):
	///        %i.vec = shuffle <8 x i32> %v0, <8 x i32> %v1,
	///                 <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>
	///        store <12 x i32> %i.vec, <12 x i32>* %ptr
	///
	///      Into:
	///        %sub.v0 = shuffle <8 x i32> %v0, <8 x i32> v1, <0, 1, 2, 3>
	///        %sub.v1 = shuffle <8 x i32> %v0, <8 x i32> v1, <4, 5, 6, 7>
	///        %sub.v2 = shuffle <8 x i32> %v0, <8 x i32> v1, <8, 9, 10, 11>
	///        call void llvm.aarch64.neon.st3(%sub.v0, %sub.v1, %sub.v2, %ptr)
	///
	/// Note that the new shufflevectors will be removed and we'll only generate one
	/// st3 instruction in CodeGen.
	///
	/// Example for a more general valid mask (Factor 3). Lower:
	///        %i.vec = shuffle <32 x i32> %v0, <32 x i32> %v1,
	///                 <4, 32, 16, 5, 33, 17, 6, 34, 18, 7, 35, 19>
	///        store <12 x i32> %i.vec, <12 x i32>* %ptr
	///
	///      Into:
	///        %sub.v0 = shuffle <32 x i32> %v0, <32 x i32> v1, <4, 5, 6, 7>
	///        %sub.v1 = shuffle <32 x i32> %v0, <32 x i32> v1, <32, 33, 34, 35>
	///        %sub.v2 = shuffle <32 x i32> %v0, <32 x i32> v1, <16, 17, 18, 19>
	///        call void llvm.aarch64.neon.st3(%sub.v0, %sub.v1, %sub.v2, %ptr)
	///
	/// Returns false, leaving the IR untouched, when NEON is unavailable or each
	/// of the \p Factor sub-vectors is not 64 or 128 bits wide; returns true once
	/// the stN call has been emitted before \p SI.
	bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI,
	                                                  ShuffleVectorInst *SVI,
	                                                  unsigned Factor) const {
	  assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
	         "Invalid interleave factor");

	  VectorType *VecTy = SVI->getType();
	  assert(VecTy->getVectorNumElements() % Factor == 0 &&
	         "Invalid interleaved store");

	  unsigned LaneLen = VecTy->getVectorNumElements() / Factor;
	  Type *EltTy = VecTy->getVectorElementType();
	  VectorType *SubVecTy = VectorType::get(EltTy, LaneLen);

	  const DataLayout &DL = SI->getModule()->getDataLayout();
	  unsigned SubVecSize = DL.getTypeSizeInBits(SubVecTy);

	  // Skip if we do not have NEON and skip illegal vector types.
	  if (!Subtarget->hasNEON() || (SubVecSize != 64 && SubVecSize != 128))
	    return false;

	  Value *Op0 = SVI->getOperand(0);
	  Value *Op1 = SVI->getOperand(1);
	  IRBuilder<> Builder(SI);

	  // StN intrinsics don't support pointer vectors as arguments. Convert pointer
	  // vectors to integer vectors.
	  if (EltTy->isPointerTy()) {
	    Type *IntTy = DL.getIntPtrType(EltTy);
	    // The operands of a shufflevector are vectors, so use cast<> (which
	    // asserts) instead of dereferencing an unchecked dyn_cast<> result.
	    unsigned NumOpElts =
	        cast<VectorType>(Op0->getType())->getVectorNumElements();

	    // Convert to the corresponding integer vector.
	    Type *IntVecTy = VectorType::get(IntTy, NumOpElts);
	    Op0 = Builder.CreatePtrToInt(Op0, IntVecTy);
	    Op1 = Builder.CreatePtrToInt(Op1, IntVecTy);

	    SubVecTy = VectorType::get(IntTy, LaneLen);
	  }

	  Type *PtrTy = SubVecTy->getPointerTo(SI->getPointerAddressSpace());
	  Type *Tys[2] = {SubVecTy, PtrTy};
	  // Indexed by Factor - 2; the assert above bounds Factor.
	  static const Intrinsic::ID StoreInts[3] = {Intrinsic::aarch64_neon_st2,
	                                             Intrinsic::aarch64_neon_st3,
	                                             Intrinsic::aarch64_neon_st4};
	  Function *StNFunc =
	      Intrinsic::getDeclaration(SI->getModule(), StoreInts[Factor - 2], Tys);

	  SmallVector<Value *, 5> Ops;

	  // Split the shufflevector operands into sub vectors for the new stN call.
	  auto Mask = SVI->getShuffleMask();
	  for (unsigned i = 0; i < Factor; i++) {
	    if (Mask[i] >= 0) {
	      Ops.push_back(Builder.CreateShuffleVector(
	          Op0, Op1, getSequentialMask(Builder, Mask[i], LaneLen)));
	    } else {
	      // The first element of this lane is undef: derive the lane's start
	      // index from the first defined element further along the lane.
	      unsigned StartMask = 0;
	      for (unsigned j = 1; j < LaneLen; j++) {
	        if (Mask[j*Factor + i] >= 0) {
	          StartMask = Mask[j*Factor + i] - j;
	          break;
	        }
	      }
	      // Note: If all elements in a chunk are undefs, StartMask=0!
	      // Note: Filling undef gaps with random elements is ok, since
	      // those elements were being written anyway (with undefs).
	      // In the case of all undefs we're defaulting to using elems from 0
	      // Note: StartMask cannot be negative, it's checked in isReInterleaveMask
	      Ops.push_back(Builder.CreateShuffleVector(
	          Op0, Op1, getSequentialMask(Builder, StartMask, LaneLen)));
	    }
	  }

	  Ops.push_back(Builder.CreateBitCast(SI->getPointerOperand(), PtrTy));
	  Builder.CreateCall(StNFunc, Ops);
	  return true;
	}

	/// Return true if both \p SrcAlign and \p DstAlign satisfy \p AlignCheck. An
	/// alignment of 0 always passes; any other alignment passes when it is a
	/// multiple of \p AlignCheck.
	static bool memOpAlign(unsigned DstAlign, unsigned SrcAlign,
	                       unsigned AlignCheck) {
	  if (SrcAlign != 0 && SrcAlign % AlignCheck != 0)
	    return false;
	  if (DstAlign != 0 && DstAlign % AlignCheck != 0)
	    return false;
	  return true;
	}

	/// Choose the widest type to use when expanding a memory operation of
	/// \p Size bytes: f128, then i64, then i32, and MVT::Other if none applies.
	/// A type is usable when \p Size is at least its width in bytes and either
	/// both alignments satisfy that width (per memOpAlign) or misaligned accesses
	/// of that type are allowed and reported fast. f128 is additionally limited to
	/// non-memset operations on subtargets with FP-ARMv8 in functions without the
	/// NoImplicitFloat attribute.
	EVT AArch64TargetLowering::getOptimalMemOpType(uint64_t Size, unsigned DstAlign,
	                                               unsigned SrcAlign, bool IsMemset,
	                                               bool ZeroMemset,
	                                               bool MemcpyStrSrc,
	                                               MachineFunction &MF) const {
	  const Function *F = MF.getFunction();

	  // True when an access of type VT needing AlignCheck-byte alignment is
	  // either naturally aligned or a fast misaligned access.
	  auto AlignedOrFastUnaligned = [&](EVT VT, unsigned AlignCheck) {
	    bool Fast;
	    return memOpAlign(SrcAlign, DstAlign, AlignCheck) ||
	           (allowsMisalignedMemoryAccesses(VT, 0, 1, &Fast) && Fast);
	  };

	  // Don't use AdvSIMD to implement 16-byte memset. It would have taken one
	  // instruction to materialize the v2i64 zero and one store (with restrictive
	  // addressing mode). Just do two i64 store of zero-registers.
	  const bool MayUseFP128 = Subtarget->hasFPARMv8() && !IsMemset &&
	                           Size >= 16 &&
	                           !F->hasFnAttribute(Attribute::NoImplicitFloat);
	  if (MayUseFP128 && AlignedOrFastUnaligned(MVT::f128, 16))
	    return MVT::f128;

	  if (Size >= 8 && AlignedOrFastUnaligned(MVT::i64, 8))
	    return MVT::i64;

	  if (Size >= 4 && AlignedOrFastUnaligned(MVT::i32, 4))
	    return MVT::i32;

	  return MVT::Other;
	}

	// 12-bit optionally shifted immediates are legal for adds. The magnitude of
	// \p Immed must either fit in the low 12 bits, or have its low 12 bits clear
	// and fit in 24 bits.
	bool AArch64TargetLowering::isLegalAddImmediate(int64_t Immed) const {
	  // std::abs(INT64_MIN) is undefined, so reject it up front.
	  if (Immed == std::numeric_limits<int64_t>::min())
	    return false;

	  // Add and sub share the encoding; only the magnitude matters.
	  const int64_t Magnitude = std::abs(Immed);
	  const bool FitsUnshifted = (Magnitude >> 12) == 0;
	  const bool FitsShifted =
	      (Magnitude & 0xfff) == 0 && (Magnitude >> 24) == 0;
	  return FitsUnshifted || FitsShifted;
	}

	// Integer comparisons are implemented with ADDS/SUBS, so an immediate is legal
	// for a compare exactly when it is legal for an add or a sub.
	bool AArch64TargetLowering::isLegalICmpImmediate(int64_t Immed) const {
	  const bool LegalForAdd = isLegalAddImmediate(Immed);
	  return LegalForAdd;
	}

	/// isLegalAddressingMode - Return true if the addressing mode represented
	/// by AM is legal for this target, for a load/store of the specified type.
	bool AArch64TargetLowering::isLegalAddressingMode(const DataLayout &DL,
	const AddrMode &AM, Type *Ty,
	unsigned AS) const {
	// AArch64 has five basic addressing modes:
	// reg
	// reg + 9-bit signed offset
	// reg + SIZE_IN_BYTES * 12-bit unsigned offset
	// reg1 + reg2
	// reg + SIZE_IN_BYTES * reg

	// No global is ever allowed as a base.
	if (AM.BaseGV)
	return false;

	// No reg+reg+imm addressing.
	if (AM.HasBaseReg && AM.BaseOffs && AM.Scale)
	return false;

	// check reg + imm case:
	// i.e., reg + 0, reg + imm9, reg + SIZE_IN_BYTES * uimm12
	uint64_t NumBytes = 0;
	if (Ty->isSized()) {
	uint64_t NumBits = DL.getTypeSizeInBits(Ty);
	NumBytes = NumBits / 8;
	if (!isPowerOf2_64(NumBits))
	NumBytes = 0;
	}

	if (!AM.Scale) {
	int64_t Offset = AM.BaseOffs;

	// 9-bit signed offset
	if (isInt<9>(Offset))
	return true;

	// 12-bit unsigned offset
	unsigned shift = Log2_64(NumBytes);
	if (NumBytes && Offset > 0 && (Offset / NumBytes) <= (1LL << 12) - 1 &&
	// Must be a multiple of NumBytes (NumBytes is a power of 2)
	(Offset >> shift) << shift == Offset)
	return true;
	return false;
	}

	// Check reg1 + SIZE_IN_BYTES * reg2 and reg1 + reg2

	return AM.Scale == 1 \|\| (AM.Scale > 0 && (uint64_t)AM.Scale == NumBytes);
	}

	/// Return the cost of the scaling factor used by addressing mode \p AM for
	/// type \p Ty: -1 if the mode is not legal, 1 if it uses a scale other than 0
	/// or 1, and 0 otherwise.
	int AArch64TargetLowering::getScalingFactorCost(const DataLayout &DL,
	                                                const AddrMode &AM, Type *Ty,
	                                                unsigned AS) const {
	  // Scaling factors are not free at all.
	  // Operands                     | Rt Latency
	  // -------------------------------------------
	  // Rt, [Xn, Xm]                 | 4
	  // -------------------------------------------
	  // Rt, [Xn, Xm, lsl #imm]       | Rn: 4 Rm: 5
	  // Rt, [Xn, Wm, <extend> #imm]  |
	  if (!isLegalAddressingMode(DL, AM, Ty, AS))
	    return -1;

	  // Scale represents reg2 * scale, thus account for 1 if
	  // it is not equal to 0 or 1.
	  const bool UsesRealScale = AM.Scale != 0 && AM.Scale != 1;
	  return UsesRealScale ? 1 : 0;
	}

	/// Return true when the scalar type of \p VT is f32 or f64; false for every
	/// other scalar type, including non-simple ones.
	bool AArch64TargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
	  const EVT ScalarVT = VT.getScalarType();
	  if (!ScalarVT.isSimple())
	    return false;

	  const auto Ty = ScalarVT.getSimpleVT().SimpleTy;
	  return Ty == MVT::f32 || Ty == MVT::f64;
	}

	/// Return the zero-terminated list of scratch registers: X16, X17 and LR.
	const MCPhysReg *
	AArch64TargetLowering::getScratchRegisters(CallingConv::ID) const {
	  // LR is a callee-save register, but we must treat it as clobbered by any call
	  // site. Hence we include LR in the scratch registers, which are in turn added
	  // as implicit-defs for stackmaps and patchpoints.
	  static const MCPhysReg Scratch[] = {AArch64::X16, AArch64::X17, AArch64::LR,
	                                      0};
	  return Scratch;
	}

	bool
	AArch64TargetLowering::isDesirableToCommuteWithShift(const SDNode *N) const {
	EVT VT = N->getValueType(0);
	// If N is unsigned bit extraction: ((x >> C) & mask), then do not combine
	// it with shift to let it be lowered to UBFX.
	if (N->getOpcode() == ISD::AND && (VT == MVT::i32 \|\| VT == MVT::i64) &&
	isa<ConstantSDNode>(N->getOperand(1))) {
	uint64_t TruncMask = N->getConstantOperandVal(1);
	if (isMask_64(TruncMask) &&
	N->getOperand(0).getOpcode() == ISD::SRL &&
	isa<ConstantSDNode>(N->getOperand(0)->getOperand(1)))
	return false;
	}
	return true;
	}

	/// Decide whether a constant-pool load of the integer \p Imm of type \p Ty
	/// should instead be materialized as an immediate.
	///
	/// Returns true for zero, for values accepted by
	/// AArch64_AM::isLogicalImmediate, and for values whose (inverted, if
	/// negative) bit pattern has its highest set bit within the low 48 bits.
	/// Returns false when \p Ty has a primitive size of 0.
	bool AArch64TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
	                                                              Type *Ty) const {
	  assert(Ty->isIntegerTy());

	  unsigned BitSize = Ty->getPrimitiveSizeInBits();
	  if (BitSize == 0)
	    return false;

	  int64_t Val = Imm.getSExtValue();
	  if (Val == 0 || AArch64_AM::isLogicalImmediate(Val, BitSize))
	    return true;

	  // Negative values are costed on their bitwise inverse.
	  if (Val < 0)
	    Val = ~Val;
	  if (BitSize == 32)
	    Val &= (1LL << 32) - 1;

	  // An all-ones input (-1) inverts to 0. countLeadingZeros would then yield
	  // 64 and "63 - LZ" would wrap around in unsigned arithmetic, wrongly
	  // reporting the constant as expensive. A zero pattern needs no MOVK at all
	  // (presumably a single MOVN materializes the original value), so accept it.
	  if (Val == 0)
	    return true;

	  unsigned LZ = countLeadingZeros((uint64_t)Val);
	  unsigned Shift = (63 - LZ) / 16;
	  // MOVZ is free so return true for one or fewer MOVK.
	  return Shift < 3;
	}

	/// Turn vector tests of the signbit in the form of:
	///   xor (sra X, elt_size(X)-1), -1
	/// into:
	///   cmge X, X, #0
	///
	/// Returns an empty SDValue when NEON is unavailable, the result type of \p N
	/// is not a vector, or \p N does not match the pattern.
	static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG,
	                                         const AArch64Subtarget *Subtarget) {
	  EVT VT = N->getValueType(0);
	  if (!Subtarget->hasNEON() || !VT.isVector())
	    return SDValue();

	  // Operand 0 must be a single-use arithmetic shift right ...
	  SDValue Shift = N->getOperand(0);
	  if (Shift.getOpcode() != AArch64ISD::VASHR || !Shift.hasOneUse())
	    return SDValue();

	  // ... and operand 1 must be all ones, i.e. the xor is a 'not'.
	  SDValue Ones = N->getOperand(1);
	  if (!ISD::isBuildVectorAllOnes(Ones.getNode()))
	    return SDValue();

	  // The shift should be smearing the sign bit across each vector element.
	  auto *ShiftAmt = dyn_cast<ConstantSDNode>(Shift.getOperand(1));
	  if (!ShiftAmt)
	    return SDValue();
	  EVT ShiftEltTy = Shift.getValueType().getVectorElementType();
	  if (ShiftAmt->getZExtValue() != ShiftEltTy.getSizeInBits() - 1)
	    return SDValue();

	  return DAG.getNode(AArch64ISD::CMGEz, SDLoc(N), VT, Shift.getOperand(0));
	}

	// Generate SUBS and CSEL for integer abs.
	//
	// Matches XOR(ADD(X, Y), Y) where Y is SRA(X, size(X)-1) and rewrites it as
	// CSEL(X, 0 - X, PL, flags of SUBS(X, 0)). Returns an empty SDValue when the
	// pattern does not match.
	static SDValue performIntegerAbsCombine(SDNode *N, SelectionDAG &DAG) {
	  EVT VT = N->getValueType(0);
	  SDValue N0 = N->getOperand(0);
	  SDValue N1 = N->getOperand(1);

	  if (!VT.isInteger() || N->getOpcode() != ISD::XOR)
	    return SDValue();
	  // N0 must be ADD(X, Y) with Y being the xor's other operand.
	  if (N0.getOpcode() != ISD::ADD || N0.getOperand(1) != N1)
	    return SDValue();
	  // Y must be SRA(X, C) over the same X.
	  if (N1.getOpcode() != ISD::SRA || N1.getOperand(0) != N0.getOperand(0))
	    return SDValue();
	  // C must be the constant size(X) - 1.
	  ConstantSDNode *ShiftAmt = dyn_cast<ConstantSDNode>(N1.getOperand(1));
	  if (!ShiftAmt || ShiftAmt->getAPIntValue() != VT.getSizeInBits() - 1)
	    return SDValue();

	  SDLoc DL(N);
	  SDValue Neg = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
	                            N0.getOperand(0));
	  // Generate SUBS & CSEL.
	  SDValue Cmp = DAG.getNode(AArch64ISD::SUBS, DL, DAG.getVTList(VT, MVT::i32),
	                            N0.getOperand(0), DAG.getConstant(0, DL, VT));
	  return DAG.getNode(AArch64ISD::CSEL, DL, VT, N0.getOperand(0), Neg,
	                     DAG.getConstant(AArch64CC::PL, DL, MVT::i32),
	                     SDValue(Cmp.getNode(), 1));
	}

	/// Target combine for XOR nodes. Does nothing before operation legalization;
	/// afterwards tries the vector sign-bit-test fold first and falls back to the
	/// integer-abs combine.
	static SDValue performXorCombine(SDNode *N, SelectionDAG &DAG,
	                                 TargetLowering::DAGCombinerInfo &DCI,
	                                 const AArch64Subtarget *Subtarget) {
	  if (DCI.isBeforeLegalizeOps())
	    return SDValue();

	  SDValue SignTest = foldVectorXorShiftIntoCmp(N, DAG, Subtarget);
	  if (SignTest.getNode())
	    return SignTest;

	  return performIntegerAbsCombine(N, DAG);
	}

	/// Build the expansion of a signed division of \p N's first operand by the
	/// power-of-two (or negated power-of-two) constant \p Divisor.
	///
	/// Returns SDValue(N, 0) when integer division is reported cheap, so the SDIV
	/// is kept. Returns an empty SDValue when the type is not i32/i64 or neither
	/// \p Divisor nor its negation is a power of two. Otherwise emits
	///   sra (csel (add X, 2^k - 1), X, X < 0), k
	/// negated when \p Divisor is negative. If \p Created is non-null, the
	/// intermediate nodes built here are appended to it.
	SDValue
	AArch64TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
	                                     SelectionDAG &DAG,
	                                     std::vector<SDNode *> *Created) const {
	  AttributeSet Attr = DAG.getMachineFunction().getFunction()->getAttributes();
	  if (isIntDivCheap(N->getValueType(0), Attr))
	    return SDValue(N,0); // Lower SDIV as SDIV

	  // fold (sdiv X, pow2)
	  EVT VT = N->getValueType(0);
	  if ((VT != MVT::i32 && VT != MVT::i64) ||
	      !(Divisor.isPowerOf2() || (-Divisor).isPowerOf2()))
	    return SDValue();

	  SDLoc DL(N);
	  SDValue N0 = N->getOperand(0);
	  unsigned Lg2 = Divisor.countTrailingZeros();
	  SDValue Zero = DAG.getConstant(0, DL, VT);
	  SDValue Pow2MinusOne = DAG.getConstant((1ULL << Lg2) - 1, DL, VT);

	  // Add (N0 < 0) ? Pow2 - 1 : 0;
	  SDValue CCVal;
	  SDValue Cmp = getAArch64Cmp(N0, Zero, ISD::SETLT, CCVal, DAG, DL);
	  SDValue Add = DAG.getNode(ISD::ADD, DL, VT, N0, Pow2MinusOne);
	  SDValue CSel = DAG.getNode(AArch64ISD::CSEL, DL, VT, Add, N0, CCVal, Cmp);

	  if (Created) {
	    Created->push_back(Cmp.getNode());
	    Created->push_back(Add.getNode());
	    Created->push_back(CSel.getNode());
	  }

	  // Divide by pow2.
	  SDValue SRA =
	      DAG.getNode(ISD::SRA, DL, VT, CSel, DAG.getConstant(Lg2, DL, MVT::i64));

	  // If we're dividing by a positive value, we're done. Otherwise, we must
	  // negate the result.
	  if (Divisor.isNonNegative())
	    return SRA;

	  if (Created)
	    Created->push_back(SRA.getNode());
	  return DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), SRA);
	}

	static SDValue performMulCombine(SDNode *N, SelectionDAG &DAG,
	TargetLowering::DAGCombinerInfo &DCI,
	const AArch64Subtarget *Subtarget) {
	if (DCI.isBeforeLegalizeOps())
	return SDValue();

	// The below optimizations require a constant RHS.
	if (!isa<ConstantSDNode>(N->getOperand(1)))
	return SDValue();

	ConstantSDNode *C = cast<ConstantSDNode>(N->getOperand(1));
	const APInt &ConstValue = C->getAPIntValue();

	// Multiplication of a power of two plus/minus one can be done more
	// cheaply as a shift+add/sub. For now, this is true unilaterally. If
	// future CPUs have a cheaper MADD instruction, this may need to be
	// gated on a subtarget feature. For Cyclone, 32-bit MADD is 4 cycles and
	// 64-bit is 5 cycles, so this is always a win.
	// More aggressively, some multiplications N0 * C can be lowered to
	// shift+add+shift if the constant C = A * B where A = 2^N + 1 and B = 2^M,
	// e.g. 6=3*2=(2+1)*2.
	// TODO: consider lowering more cases, e.g. C = 14, -6, -14 or even 45
	// which equals to (1+2)*16-(1+2).
	SDValue N0 = N->getOperand(0);
	// TrailingZeroes is used to test if the mul can be lowered to
	// shift+add+shift.
	unsigned TrailingZeroes = ConstValue.countTrailingZeros();
	if (TrailingZeroes) {
	// Conservatively do not lower to shift+add+shift if the mul might be
	// folded into smul or umul.
	if (N0->hasOneUse() && (isSignExtended(N0.getNode(), DAG) \|\|
	isZeroExtended(N0.getNode(), DAG)))
	return SDValue();
	// Conservatively do not lower to shift+add+shift if the mul might be
	// folded into madd or msub.
	if (N->hasOneUse() && (N->use_begin()->getOpcode() == ISD::ADD \|\|
	N->use_begin()->getOpcode() == ISD::SUB))
	return SDValue();
	}
	// Use ShiftedConstValue instead of ConstValue to support both shift+add/sub
	// and shift+add+shift.
	APInt ShiftedConstValue = ConstValue.ashr(TrailingZeroes);

	unsigned ShiftAmt, AddSubOpc;
	// Is the shifted value the LHS operand of the add/sub?
	bool ShiftValUseIsN0 = true;
	// Do we need to negate the result?
	bool NegateResult = false;

	if (ConstValue.isNonNegative()) {
	// (mul x, 2^N + 1) => (add (shl x, N), x)
	// (mul x, 2^N - 1) => (sub (shl x, N), x)
	// (mul x, (2^N + 1) * 2^M) => (shl (add (shl x, N), x), M)
	APInt SCVMinus1 = ShiftedConstValue - 1;
	APInt CVPlus1 = ConstValue + 1;
	if (SCVMinus1.isPowerOf2()) {
	ShiftAmt = SCVMinus1.logBase2();
	AddSubOpc = ISD::ADD;
	} else if (CVPlus1.isPowerOf2()) {
	ShiftAmt = CVPlus1.logBase2();
	AddSubOpc = ISD::SUB;
	} else
	return SDValue();
	} else {
	// (mul x, -(2^N - 1)) => (sub x, (shl x, N))
	// (mul x, -(2^N + 1)) => - (add (shl x, N), x)
	APInt CVNegPlus1 = -ConstValue + 1;
	APInt CVNegMinus1 = -ConstValue - 1;
	if (CVNegPlus1.isPowerOf2()) {
	ShiftAmt = CVNegPlus1.logBase2();
	AddSubOpc = ISD::SUB;
	ShiftValUseIsN0 = false;
	} else if (CVNegMinus1.isPowerOf2()) {
	ShiftAmt = CVNegMinus1.logBase2();
	AddSubOpc = ISD::ADD;
	NegateResult = true;
	} else
	return SDValue();
	}

	SDLoc DL(N);
	EVT VT = N->getValueType(0);
	SDValue ShiftedVal = DAG.getNode(ISD::SHL, DL, VT, N0,
	DAG.getConstant(ShiftAmt, DL, MVT::i64));

	SDValue AddSubN0 = ShiftValUseIsN0 ? ShiftedVal : N0;
	SDValue AddSubN1 = ShiftValUseIsN0 ? N0 : ShiftedVal;
	SDValue Res = DAG.getNode(AddSubOpc, DL, VT, AddSubN0, AddSubN1);
	assert(!(NegateResult && TrailingZeroes) &&
	"NegateResult and TrailingZeroes cannot both be true for now.");
	// Negate the result.
	if (NegateResult)
	return DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Res);
	// Shift the result.
	if (TrailingZeroes)
	return DAG.getNode(ISD::SHL, DL, VT, Res,
	DAG.getConstant(TrailingZeroes, DL, MVT::i64));
	return Res;
	}

	/// Push a unary operation through a masked vector comparison:
	///   UNARYOP(AND(VECTOR_CMP(x, y), constant))
	///     --> AND(VECTOR_CMP(x, y), UNARYOP(constant))
	/// Returns an empty SDValue when \p N does not have this shape.
	static SDValue performVectorCompareAndMaskUnaryOpCombine(SDNode *N,
	                                                         SelectionDAG &DAG) {
	  // The result must be a vector ...
	  EVT VT = N->getValueType(0);
	  if (!VT.isVector())
	    return SDValue();

	  // ... computed from a bitwise AND ...
	  SDValue Masked = N->getOperand(0);
	  if (Masked->getOpcode() != ISD::AND)
	    return SDValue();

	  // ... whose first operand is a comparison ...
	  SDValue Compare = Masked->getOperand(0);
	  if (Compare->getOpcode() != ISD::SETCC)
	    return SDValue();

	  // ... and whose total width matches the width of the result.
	  if (VT.getSizeInBits() != Masked->getValueType(0).getSizeInBits())
	    return SDValue();

	  // The other AND operand has to be a constant build_vector. Non-constant
	  // splats are deliberately left alone: transforming them would not remove
	  // any operation.
	  BuildVectorSDNode *MaskBV =
	      dyn_cast<BuildVectorSDNode>(Masked->getOperand(1));
	  if (!MaskBV)
	    return SDValue();
	  if (!MaskBV->isConstant())
	    return SDValue();

	  SDLoc DL(N);
	  EVT IntVT = MaskBV->getValueType(0);
	  // Apply the unary operation to the constant itself.
	  SDValue Converted =
	      DAG.getNode(N->getOpcode(), DL, VT, SDValue(MaskBV, 0));
	  // The AND is done on the integer vector type, so bitcast the converted
	  // constant into it and bitcast the AND result back out.
	  SDValue NewMask = DAG.getNode(ISD::BITCAST, DL, IntVT, Converted);
	  SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT, Compare, NewMask);
	  return DAG.getNode(ISD::BITCAST, DL, VT, NewAnd);
	}

	/// Combine an integer-to-floating-point conversion.
	///
	/// Vector conversions are first offered to
	/// performVectorCompareAndMaskUnaryOpCombine. Otherwise, for an f32/f64
	/// result whose source has the same width and is a one-use, non-volatile
	/// normal load, the load is re-issued with the floating-point type and
	/// converted with SITOF/UITOF.
	static SDValue performIntToFpCombine(SDNode *N, SelectionDAG &DAG,
	                                     const AArch64Subtarget *Subtarget) {
	  if (SDValue Folded = performVectorCompareAndMaskUnaryOpCombine(N, DAG))
	    return Folded;

	  EVT VT = N->getValueType(0);
	  if (VT != MVT::f32 && VT != MVT::f64)
	    return SDValue();

	  // Source and destination must be equally wide.
	  SDValue Src = N->getOperand(0);
	  if (VT.getSizeInBits() != Src.getValueSizeInBits())
	    return SDValue();

	  // Using an fp load plus an AdvSIMD scalar SCVTF/UCVTF eliminates an
	  // "integer-to-vector-move" UOP and improves throughput; it needs NEON and
	  // a load whose only user is this conversion.
	  if (!Subtarget->hasNEON())
	    return SDValue();
	  if (!ISD::isNormalLoad(Src.getNode()))
	    return SDValue();
	  if (!Src.hasOneUse())
	    return SDValue();

	  // Do not change the width of a volatile load.
	  LoadSDNode *IntLoad = cast<LoadSDNode>(Src);
	  if (IntLoad->isVolatile())
	    return SDValue();

	  SDLoc DL(N);
	  SDValue FPLoad =
	      DAG.getLoad(VT, DL, IntLoad->getChain(), IntLoad->getBasePtr(),
	                  IntLoad->getPointerInfo(), IntLoad->getAlignment(),
	                  IntLoad->getMemOperand()->getFlags());

	  // Keep successors of the original load ordered after the new one by
	  // moving them onto the new chain.
	  DAG.ReplaceAllUsesOfValueWith(SDValue(IntLoad, 1), FPLoad.getValue(1));

	  unsigned ConvOpc = AArch64ISD::UITOF;
	  if (N->getOpcode() == ISD::SINT_TO_FP)
	    ConvOpc = AArch64ISD::SITOF;
	  return DAG.getNode(ConvOpc, DL, VT, FPLoad);
	}

	/// Fold a floating-point multiply by power of two into floating-point to
	/// fixed-point conversion.
	///
	/// \p N is the fp-to-int conversion node; its operand must be a vector FMUL
	/// whose second operand is a build_vector. On success the result is an
	/// aarch64_neon_vcvtfp2fxs/vcvtfp2fxu intrinsic node (signed for FP_TO_SINT,
	/// unsigned otherwise), truncated when the integer element is narrower than
	/// the float element. Returns an empty SDValue when the pattern does not
	/// match or NEON is unavailable.
	static SDValue performFpToIntCombine(SDNode *N, SelectionDAG &DAG,
	TargetLowering::DAGCombinerInfo &DCI,
	const AArch64Subtarget *Subtarget) {
	if (!Subtarget->hasNEON())
	return SDValue();

	SDValue Op = N->getOperand(0);
	if (!Op.getValueType().isVector() \|\| !Op.getValueType().isSimple() \|\|
	Op.getOpcode() != ISD::FMUL)
	return SDValue();

	SDValue ConstVec = Op->getOperand(1);
	if (!isa<BuildVectorSDNode>(ConstVec))
	return SDValue();

	// Only 32- and 64-bit float elements are handled.
	MVT FloatTy = Op.getSimpleValueType().getVectorElementType();
	uint32_t FloatBits = FloatTy.getSizeInBits();
	if (FloatBits != 32 && FloatBits != 64)
	return SDValue();

	// Only 16-, 32- and 64-bit integer elements are handled.
	MVT IntTy = N->getSimpleValueType(0).getVectorElementType();
	uint32_t IntBits = IntTy.getSizeInBits();
	if (IntBits != 16 && IntBits != 32 && IntBits != 64)
	return SDValue();

	// Avoid conversions where iN is larger than the float (e.g., float -> i64).
	if (IntBits > FloatBits)
	return SDValue();

	BitVector UndefElements;
	BuildVectorSDNode *BV = cast<BuildVectorSDNode>(ConstVec);
	// Upper bound on the accepted value of C: 64 for i64 elements, else 32.
	int32_t Bits = IntBits == 64 ? 64 : 32;
	// NOTE(review): presumably C is log2 of the splatted power-of-two constant
	// and -1 means "no such splat" -- confirm against
	// BuildVectorSDNode::getConstantFPSplatPow2ToLog2Int. Values of -1, 0 and
	// anything above Bits are rejected here.
	int32_t C = BV->getConstantFPSplatPow2ToLog2Int(&UndefElements, Bits + 1);
	if (C == -1 \|\| C == 0 \|\| C > Bits)
	return SDValue();

	// Pick the integer vector type with the same lane count and element width
	// as the float input; only 2- and 4-lane vectors are handled.
	MVT ResTy;
	unsigned NumLanes = Op.getValueType().getVectorNumElements();
	switch (NumLanes) {
	default:
	return SDValue();
	case 2:
	ResTy = FloatBits == 32 ? MVT::v2i32 : MVT::v2i64;
	break;
	case 4:
	ResTy = FloatBits == 32 ? MVT::v4i32 : MVT::v4i64;
	break;
	}

	if (ResTy == MVT::v4i64 && DCI.isBeforeLegalizeOps())
	return SDValue();

	// Having returned above for v4i64 before operation legalization, reaching
	// this point with v4i64 means we are after legalization, which this assert
	// rejects.
	assert((ResTy != MVT::v4i64 \|\| DCI.isBeforeLegalizeOps()) &&
	"Illegal vector type after legalization");

	SDLoc DL(N);
	bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT;
	unsigned IntrinsicOpcode = IsSigned ? Intrinsic::aarch64_neon_vcvtfp2fxs
	: Intrinsic::aarch64_neon_vcvtfp2fxu;
	// The intrinsic takes the un-multiplied FMUL input and C as its operands.
	SDValue FixConv =
	DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, ResTy,
	DAG.getConstant(IntrinsicOpcode, DL, MVT::i32),
	Op->getOperand(0), DAG.getConstant(C, DL, MVT::i32));
	// We can handle smaller integers by generating an extra trunc.
	if (IntBits < FloatBits)
	FixConv = DAG.getNode(ISD::TRUNCATE, DL, N->getValueType(0), FixConv);

	return FixConv;
	}

	/// Fold a floating-point divide by power of two into fixed-point to
	/// floating-point conversion.
	///
	/// \p N is the divide node; its first operand must be a vector
	/// SINT_TO_FP/UINT_TO_FP and its second operand a build_vector. On success
	/// the result is an aarch64_neon_vcvtfxs2fp/vcvtfxu2fp intrinsic node whose
	/// integer input is sign- or zero-extended first when it is narrower than
	/// the float element. Returns an empty SDValue when the pattern does not
	/// match or NEON is unavailable.
	static SDValue performFDivCombine(SDNode *N, SelectionDAG &DAG,
	TargetLowering::DAGCombinerInfo &DCI,
	const AArch64Subtarget *Subtarget) {
	if (!Subtarget->hasNEON())
	return SDValue();

	SDValue Op = N->getOperand(0);
	unsigned Opc = Op->getOpcode();
	if (!Op.getValueType().isVector() \|\| !Op.getValueType().isSimple() \|\|
	!Op.getOperand(0).getValueType().isSimple() \|\|
	(Opc != ISD::SINT_TO_FP && Opc != ISD::UINT_TO_FP))
	return SDValue();

	SDValue ConstVec = N->getOperand(1);
	if (!isa<BuildVectorSDNode>(ConstVec))
	return SDValue();

	// Only 16-, 32- and 64-bit integer elements are handled.
	MVT IntTy = Op.getOperand(0).getSimpleValueType().getVectorElementType();
	int32_t IntBits = IntTy.getSizeInBits();
	if (IntBits != 16 && IntBits != 32 && IntBits != 64)
	return SDValue();

	// Only 32- and 64-bit float elements are handled.
	MVT FloatTy = N->getSimpleValueType(0).getVectorElementType();
	int32_t FloatBits = FloatTy.getSizeInBits();
	if (FloatBits != 32 && FloatBits != 64)
	return SDValue();

	// Avoid conversions where iN is larger than the float (e.g., i64 -> float).
	if (IntBits > FloatBits)
	return SDValue();

	BitVector UndefElements;
	BuildVectorSDNode *BV = cast<BuildVectorSDNode>(ConstVec);
	// NOTE(review): presumably C is log2 of the splatted power-of-two divisor
	// and -1 means "no such splat" -- confirm against
	// BuildVectorSDNode::getConstantFPSplatPow2ToLog2Int. Values of -1, 0 and
	// anything above FloatBits are rejected here.
	int32_t C = BV->getConstantFPSplatPow2ToLog2Int(&UndefElements, FloatBits + 1);
	if (C == -1 \|\| C == 0 \|\| C > FloatBits)
	return SDValue();

	// Pick the integer vector type with the same lane count and element width
	// as the float result; only 2- and 4-lane vectors are handled.
	MVT ResTy;
	unsigned NumLanes = Op.getValueType().getVectorNumElements();
	switch (NumLanes) {
	default:
	return SDValue();
	case 2:
	ResTy = FloatBits == 32 ? MVT::v2i32 : MVT::v2i64;
	break;
	case 4:
	ResTy = FloatBits == 32 ? MVT::v4i32 : MVT::v4i64;
	break;
	}

	if (ResTy == MVT::v4i64 && DCI.isBeforeLegalizeOps())
	return SDValue();

	SDLoc DL(N);
	SDValue ConvInput = Op.getOperand(0);
	bool IsSigned = Opc == ISD::SINT_TO_FP;
	// Widen a narrower integer input to ResTy, matching the signedness of the
	// original conversion.
	if (IntBits < FloatBits)
	ConvInput = DAG.getNode(IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, DL,
	ResTy, ConvInput);

	unsigned IntrinsicOpcode = IsSigned ? Intrinsic::aarch64_neon_vcvtfxs2fp
	: Intrinsic::aarch64_neon_vcvtfxu2fp;
	return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op.getValueType(),
	DAG.getConstant(IntrinsicOpcode, DL, MVT::i32), ConvInput,
	DAG.getConstant(C, DL, MVT::i32));
	}

	/// Classify one operand of a candidate EXTR, which is built from two
	/// constant shifts ORed together.
	///
	/// \p FromHi is set to false for a SHL and true for a SRL (it is written
	/// before the shift amount is checked). \p Src and \p ShiftAmount are only
	/// written when the function returns true.
	/// \return true when \p N is a SHL or SRL by a constant amount.
	static bool findEXTRHalf(SDValue N, SDValue &Src, uint32_t &ShiftAmount,
	                         bool &FromHi) {
	  switch (N.getOpcode()) {
	  case ISD::SHL:
	    FromHi = false;
	    break;
	  case ISD::SRL:
	    FromHi = true;
	    break;
	  default:
	    return false;
	  }

	  // Only shifts by an immediate qualify.
	  const bool HasConstantAmount = isa<ConstantSDNode>(N.getOperand(1));
	  if (!HasConstantAmount)
	    return false;

	  Src = N->getOperand(0);
	  ShiftAmount = N->getConstantOperandVal(1);
	  return true;
	}

	/// EXTR extracts a contiguous run of bits from two registers viewed as a
	/// high/low pair. This matches
	///   (or (shl VAL1, #N), (srl VAL2, #RegWidth-N))
	/// with the two shifts in either operand order, and rewrites it to an
	/// AArch64ISD::EXTR node. TableGen cannot express this because the two
	/// immediates are not independent.
	static SDValue tryCombineToEXTR(SDNode *N,
	                                TargetLowering::DAGCombinerInfo &DCI) {
	  assert(N->getOpcode() == ISD::OR && "Unexpected root");

	  EVT VT = N->getValueType(0);
	  if (VT != MVT::i32 && VT != MVT::i64)
	    return SDValue();

	  // Classify both OR operands as constant shifts.
	  SDValue Src0, Src1;
	  uint32_t Amt0 = 0, Amt1 = 0;
	  bool IsSrl0 = false, IsSrl1 = false;
	  if (!findEXTRHalf(N->getOperand(0), Src0, Amt0, IsSrl0))
	    return SDValue();
	  if (!findEXTRHalf(N->getOperand(1), Src1, Amt1, IsSrl1))
	    return SDValue();

	  // One half must be a left shift and the other a right shift.
	  if (IsSrl0 == IsSrl1)
	    return SDValue();

	  // The two shift amounts have to add up to the register width.
	  if (Amt0 + Amt1 != VT.getSizeInBits())
	    return SDValue();

	  // Order the operands as (shl source, srl source) and use the srl amount as
	  // the EXTR immediate, whichever way round they appeared in the OR.
	  SDValue ShlSrc = IsSrl0 ? Src1 : Src0;
	  SDValue SrlSrc = IsSrl0 ? Src0 : Src1;
	  const uint32_t SrlAmt = IsSrl0 ? Amt0 : Amt1;

	  SelectionDAG &DAG = DCI.DAG;
	  SDLoc DL(N);
	  SDValue Imm = DAG.getConstant(SrlAmt, DL, MVT::i64);
	  return DAG.getNode(AArch64ISD::EXTR, DL, VT, ShlSrc, SrlSrc, Imm);
	}

	// Try to turn a vector (or (and ...), (and ...)) into an AArch64ISD::BSL
	// node. This only fires when each AND has a build_vector operand and the two
	// build_vectors are element-wise constant bitwise complements of each other
	// within the element width. Returns an empty SDValue otherwise.
	static SDValue tryCombineToBSL(SDNode *N,
	TargetLowering::DAGCombinerInfo &DCI) {
	EVT VT = N->getValueType(0);
	SelectionDAG &DAG = DCI.DAG;
	SDLoc DL(N);

	if (!VT.isVector())
	return SDValue();

	SDValue N0 = N->getOperand(0);
	if (N0.getOpcode() != ISD::AND)
	return SDValue();

	SDValue N1 = N->getOperand(1);
	if (N1.getOpcode() != ISD::AND)
	return SDValue();

	// We only have to look for constant vectors here since the general, variable
	// case can be handled in TableGen.
	unsigned Bits = VT.getScalarSizeInBits();
	// Mask covering one element; special-cased for 64 bits because a 64-bit
	// shift of 1ULL would be out of range.
	uint64_t BitMask = Bits == 64 ? -1ULL : ((1ULL << Bits) - 1);
	// Try every pairing of one operand from each AND as the candidate masks,
	// starting with operand 1 of both.
	for (int i = 1; i >= 0; --i)
	for (int j = 1; j >= 0; --j) {
	BuildVectorSDNode *BVN0 = dyn_cast<BuildVectorSDNode>(N0->getOperand(i));
	BuildVectorSDNode *BVN1 = dyn_cast<BuildVectorSDNode>(N1->getOperand(j));
	if (!BVN0 \|\| !BVN1)
	continue;

	// Every element must be a constant, and the two masks must be exact
	// complements of each other within the element width.
	bool FoundMatch = true;
	for (unsigned k = 0; k < VT.getVectorNumElements(); ++k) {
	ConstantSDNode *CN0 = dyn_cast<ConstantSDNode>(BVN0->getOperand(k));
	ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(BVN1->getOperand(k));
	if (!CN0 \|\| !CN1 \|\|
	CN0->getZExtValue() != (BitMask & ~CN1->getZExtValue())) {
	FoundMatch = false;
	break;
	}
	}

	// BSL operands: the mask from the first AND, then the non-mask operand of
	// each AND.
	if (FoundMatch)
	return DAG.getNode(AArch64ISD::BSL, DL, VT, SDValue(BVN0, 0),
	N0->getOperand(1 - i), N1->getOperand(1 - j));
	}

	return SDValue();
	}

	/// Combine an OR node: try to form an EXTR from
	/// (or (shl VAL1, #N), (srl VAL2, #RegWidth-N)) first, then a BSL. Nothing is
	/// attempted when the result type is not legal. \p Subtarget is not
	/// consulted here.
	static SDValue performORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
	                                const AArch64Subtarget *Subtarget) {
	  const TargetLowering &TLI = DCI.DAG.getTargetLoweringInfo();
	  if (!TLI.isTypeLegal(N->getValueType(0)))
	    return SDValue();

	  SDValue Combined = tryCombineToEXTR(N, DCI);
	  if (!Combined)
	    Combined = tryCombineToBSL(N, DCI);

	  // Still empty here if neither combine applied.
	  return Combined;
	}

	/// Canonicalize (srl (bswap x), W/2) to (rotr (bswap x), W/2) for W = 32 or
	/// 64 when the high W/2 bits of x are known to be zero. Returns an empty
	/// SDValue in every other case.
	static SDValue performSRLCombine(SDNode *N,
	                                 TargetLowering::DAGCombinerInfo &DCI) {
	  EVT VT = N->getValueType(0);
	  if (VT != MVT::i32 && VT != MVT::i64)
	    return SDValue();

	  SDValue Swapped = N->getOperand(0);
	  if (Swapped.getOpcode() != ISD::BSWAP)
	    return SDValue();

	  // Only a constant shift by exactly half the register width qualifies.
	  SDValue AmtOp = N->getOperand(1);
	  ConstantSDNode *AmtC = dyn_cast<ConstantSDNode>(AmtOp);
	  if (!AmtC)
	    return SDValue();

	  const unsigned Width = (VT == MVT::i32) ? 32 : 64;
	  const unsigned Half = Width / 2;
	  if (AmtC->getZExtValue() != Half)
	    return SDValue();

	  // The top half of the value being byte-swapped must be known zero.
	  SelectionDAG &DAG = DCI.DAG;
	  SDValue SwapInput = Swapped.getOperand(0);
	  if (!DAG.MaskedValueIsZero(SwapInput, APInt::getHighBitsSet(Width, Half)))
	    return SDValue();

	  return DAG.getNode(ISD::ROTR, SDLoc(N), VT, Swapped, AmtOp);
	}

	// Simplify a 64-bit vector bitcast of an extract_subvector (or an
	// EXTRACT_SUBREG of dsub) whose own input is a bitcast, by extracting the
	// low or high half directly from the un-bitcast source. Returns an empty
	// SDValue when the pattern does not match or when called before operation
	// legalization.
	static SDValue performBitcastCombine(SDNode *N,
	TargetLowering::DAGCombinerInfo &DCI,
	SelectionDAG &DAG) {
	// Wait 'til after everything is legalized to try this. That way we have
	// legal vector types and such.
	if (DCI.isBeforeLegalizeOps())
	return SDValue();

	// Remove extraneous bitcasts around an extract_subvector.
	// For example,
	// (v4i16 (bitconvert
	// (extract_subvector (v2i64 (bitconvert (v8i16 ...)), (i64 1)))))
	// becomes
	// (extract_subvector ((v8i16 ...), (i64 4)))

	// Only interested in 64-bit vectors as the ultimate result.
	EVT VT = N->getValueType(0);
	if (!VT.isVector())
	return SDValue();
	if (VT.getSimpleVT().getSizeInBits() != 64)
	return SDValue();
	// Is the operand an extract_subvector starting at the beginning or halfway
	// point of the vector? A low half may also come through as an
	// EXTRACT_SUBREG, so look for that, too.
	SDValue Op0 = N->getOperand(0);
	if (Op0->getOpcode() != ISD::EXTRACT_SUBVECTOR &&
	!(Op0->isMachineOpcode() &&
	Op0->getMachineOpcode() == AArch64::EXTRACT_SUBREG))
	return SDValue();
	// Operand 1 is read as a constant for both accepted forms: the start index
	// for EXTRACT_SUBVECTOR, the sub-register index for EXTRACT_SUBREG.
	uint64_t idx = cast<ConstantSDNode>(Op0->getOperand(1))->getZExtValue();
	if (Op0->getOpcode() == ISD::EXTRACT_SUBVECTOR) {
	// Accept only index 0 or an index equal to the extract's element count.
	if (Op0->getValueType(0).getVectorNumElements() != idx && idx != 0)
	return SDValue();
	} else if (Op0->getMachineOpcode() == AArch64::EXTRACT_SUBREG) {
	if (idx != AArch64::dsub)
	return SDValue();
	// The dsub reference is equivalent to a lane zero subvector reference.
	idx = 0;
	}
	// Look through the bitcast of the input to the extract.
	if (Op0->getOperand(0)->getOpcode() != ISD::BITCAST)
	return SDValue();
	SDValue Source = Op0->getOperand(0)->getOperand(0);
	// If the source type has twice the number of elements as our destination
	// type, we know this is an extract of the high or low half of the vector.
	EVT SVT = Source->getValueType(0);
	if (SVT.getVectorNumElements() != VT.getVectorNumElements() * 2)
	return SDValue();

	DEBUG(dbgs() << "aarch64-lower: bitcast extract_subvector simplification\n");

	// Create the simplified form to just extract the low or high half of the
	// vector directly rather than bothering with the bitcasts.
	SDLoc dl(N);
	unsigned NumElements = VT.getVectorNumElements();
	if (idx) {
	// High half: extract starting at element NumElements of the source.
	SDValue HalfIdx = DAG.getConstant(NumElements, dl, MVT::i64);
	return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Source, HalfIdx);
	} else {
	// Low half: emit an EXTRACT_SUBREG of dsub as a machine node.
	SDValue SubReg = DAG.getTargetConstant(AArch64::dsub, dl, MVT::i32);
	return SDValue(DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, dl, VT,
	Source, SubReg),
	0);
	}
	}

	// Combine a concat_vectors node. Three rewrites are attempted in order:
	//  1. concat of two truncates from v2i64 or v4i32 -> truncate of a shuffle
	//     (done regardless of legalization phase);
	//  2. after operation legalization, a two-element concat of a value with
	//     itself -> DUPLANE64;
	//  3. after operation legalization, a concat whose second operand is a
	//     bitcast of a vector -> bitcast of a concat performed in that vector's
	//     type.
	// Returns an empty SDValue when none applies.
	static SDValue performConcatVectorsCombine(SDNode *N,
	TargetLowering::DAGCombinerInfo &DCI,
	SelectionDAG &DAG) {
	SDLoc dl(N);
	EVT VT = N->getValueType(0);
	SDValue N0 = N->getOperand(0), N1 = N->getOperand(1);

	// Optimize concat_vectors of truncated vectors, where the intermediate
	// type is illegal, to avoid said illegality, e.g.,
	// (v4i16 (concat_vectors (v2i16 (truncate (v2i64))),
	// (v2i16 (truncate (v2i64)))))
	// ->
	// (v4i16 (truncate (vector_shuffle (v4i32 (bitcast (v2i64))),
	// (v4i32 (bitcast (v2i64))),
	// <0, 2, 4, 6>)))
	// This isn't really target-specific, but ISD::TRUNCATE legality isn't keyed
	// on both input and result type, so we might generate worse code.
	// On AArch64 we know it's fine for v2i64->v4i16 and v4i32->v8i8.
	if (N->getNumOperands() == 2 &&
	N0->getOpcode() == ISD::TRUNCATE &&
	N1->getOpcode() == ISD::TRUNCATE) {
	SDValue N00 = N0->getOperand(0);
	SDValue N10 = N1->getOperand(0);
	EVT N00VT = N00.getValueType();

	if (N00VT == N10.getValueType() &&
	(N00VT == MVT::v2i64 \|\| N00VT == MVT::v4i32) &&
	N00VT.getScalarSizeInBits() == 4 * VT.getScalarSizeInBits()) {
	// MidVT has elements half as wide as the truncate inputs; the shuffle
	// mask below selects every even-indexed element of the concatenated
	// bitcast inputs.
	MVT MidVT = (N00VT == MVT::v2i64 ? MVT::v4i32 : MVT::v8i16);
	SmallVector<int, 8> Mask(MidVT.getVectorNumElements());
	for (size_t i = 0; i < Mask.size(); ++i)
	Mask[i] = i * 2;
	return DAG.getNode(ISD::TRUNCATE, dl, VT,
	DAG.getVectorShuffle(
	MidVT, dl,
	DAG.getNode(ISD::BITCAST, dl, MidVT, N00),
	DAG.getNode(ISD::BITCAST, dl, MidVT, N10), Mask));
	}
	}

	// Wait 'til after everything is legalized to try this. That way we have
	// legal vector types and such.
	if (DCI.isBeforeLegalizeOps())
	return SDValue();

	// If we see a (concat_vectors (v1x64 A), (v1x64 A)) it's really a vector
	// splat. The indexed instructions are going to be expecting a DUPLANE64, so
	// canonicalise to that.
	if (N0 == N1 && VT.getVectorNumElements() == 2) {
	assert(VT.getScalarSizeInBits() == 64);
	return DAG.getNode(AArch64ISD::DUPLANE64, dl, VT, WidenVector(N0, DAG),
	DAG.getConstant(0, dl, MVT::i64));
	}

	// Canonicalise concat_vectors so that the right-hand vector has as few
	// bit-casts as possible before its real operation. The primary matching
	// destination for these operations will be the narrowing "2" instructions,
	// which depend on the operation being performed on this right-hand vector.
	// For example,
	// (concat_vectors LHS, (v1i64 (bitconvert (v4i16 RHS))))
	// becomes
	// (bitconvert (concat_vectors (v4i16 (bitconvert LHS)), RHS))

	if (N1->getOpcode() != ISD::BITCAST)
	return SDValue();
	SDValue RHS = N1->getOperand(0);
	MVT RHSTy = RHS.getValueType().getSimpleVT();
	// If the RHS is not a vector, this is not the pattern we're looking for.
	if (!RHSTy.isVector())
	return SDValue();

	DEBUG(dbgs() << "aarch64-lower: concat_vectors bitcast simplification\n");

	// The concat is done in a vector type with RHS's element type and twice
	// its element count, then bitcast back to the original result type.
	MVT ConcatTy = MVT::getVectorVT(RHSTy.getVectorElementType(),
	RHSTy.getVectorNumElements() * 2);
	return DAG.getNode(ISD::BITCAST, dl, VT,
	DAG.getNode(ISD::CONCAT_VECTORS, dl, ConcatTy,
	DAG.getNode(ISD::BITCAST, dl, RHSTy, N0),
	RHS));
	}

	/// Transform a scalar fixed-point conversion of a value taken from a lane
	/// extract into a lane extract of the equivalent vector conversion. E.g.,
	/// from foo1 to foo2:
	///   double foo1(int64x2_t a) { return vcvtd_n_f64_s64(a[1], 9); }
	///   double foo2(int64x2_t a) { return vcvtq_n_f64_s64(a, 9)[1]; }
	///
	/// The second form interacts better with instruction selection and the
	/// register allocator to avoid cross-class register copies that aren't
	/// coalescable due to a lane reference.
	///
	/// \p N is the intrinsic node: operand 0 is the intrinsic ID, operand 1 the
	/// value being converted and operand 2 the shift amount.
	/// \return the rewritten value, or an empty SDValue when called before
	/// operation legalization, when operand 1 is not an EXTRACT_VECTOR_ELT, or
	/// when the extracted-from vector is not a v4i32 or v2i64.
	static SDValue tryCombineFixedPointConvert(SDNode *N,
	                                           TargetLowering::DAGCombinerInfo &DCI,
	                                           SelectionDAG &DAG) {
	  // Wait 'til after everything is legalized to try this. That way we have
	  // legal vector types and such.
	  if (DCI.isBeforeLegalizeOps())
	    return SDValue();

	  // Check the operand and see if it originates from a lane extract.
	  SDValue Op1 = N->getOperand(1);
	  if (Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
	    return SDValue();

	  SDValue Vec = Op1.getOperand(0);

	  // The vector width is expected to be 128 bits by the time we get here,
	  // even if it started as 64 bits. Bail out instead of asserting if it is
	  // not: an unexpected shape just means this optimization does not apply,
	  // and it must not crash the compiler (or, with assertions disabled,
	  // continue with an unset result type).
	  if (Vec.getValueSizeInBits() != 128)
	    return SDValue();

	  // Only the two integer vector types that have a matching floating-point
	  // vector type are handled.
	  EVT VecResTy;
	  if (Vec.getValueType() == MVT::v4i32)
	    VecResTy = MVT::v4f32;
	  else if (Vec.getValueType() == MVT::v2i64)
	    VecResTy = MVT::v2f64;
	  else
	    return SDValue();

	  SDValue IID = N->getOperand(0);
	  SDValue Shift = N->getOperand(2);
	  SDValue Lane = Op1.getOperand(1);
	  EVT ResTy = N->getValueType(0);
	  SDLoc DL(N);

	  // Convert the whole vector, then pull out the originally requested lane.
	  SDValue Convert =
	      DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VecResTy, IID, Vec, Shift);
	  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ResTy, Convert, Lane);
	}

	// The AArch64 high-half "long" operations are matched as the ordinary long
	// operation applied to an extract_subvector of the high half of each input:
	//
	//   (longop2 LHS, RHS) == (longop (extract_high LHS), (extract_high RHS))
	//
	// Some operands have no explicit extract_high but can be given one for free,
	// for instance:
	//
	//   (dupv64 scalar) --> (extract_high (dup128 scalar))
	//
	// This helper performs that widening for DUP nodes and for the immediate
	// DUP-like nodes (MOVI/MVNI variants), once the caller has established that
	// the rest of the pattern is suitable. It returns an empty SDValue for any
	// other opcode or when N is not a 64-bit vector.
	static SDValue tryExtendDUPToExtractHigh(SDValue N, SelectionDAG &DAG) {
	  const unsigned Opc = N.getOpcode();

	  bool IsDupLike = false;
	  switch (Opc) {
	  case AArch64ISD::DUP:
	  case AArch64ISD::DUPLANE8:
	  case AArch64ISD::DUPLANE16:
	  case AArch64ISD::DUPLANE32:
	  case AArch64ISD::DUPLANE64:
	  case AArch64ISD::MOVI:
	  case AArch64ISD::MOVIshift:
	  case AArch64ISD::MOVIedit:
	  case AArch64ISD::MOVImsl:
	  case AArch64ISD::MVNIshift:
	  case AArch64ISD::MVNImsl:
	    IsDupLike = true;
	    break;
	  default:
	    // FMOV could be supported as well, but it would only show up when a
	    // bitcast floating-point immediate feeds an eligible long integer op
	    // (addl, smull, ...), which is not very useful.
	    break;
	  }
	  if (!IsDupLike)
	    return SDValue();

	  MVT HalfVT = N.getSimpleValueType();
	  if (!HalfVT.is64BitVector())
	    return SDValue();

	  // Re-create the same node with twice as many lanes ...
	  const unsigned HalfLanes = HalfVT.getVectorNumElements();
	  MVT FullVT = MVT::getVectorVT(HalfVT.getVectorElementType(), HalfLanes * 2);

	  SDLoc dl(N);
	  SDValue Wide = DAG.getNode(Opc, dl, FullVT, N->ops());

	  // ... and take its upper half.
	  SDValue HighStart = DAG.getConstant(HalfLanes, dl, MVT::i64);
	  return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, HalfVT, Wide, HighStart);
	}

	/// Return true when \p N is an EXTRACT_SUBVECTOR, or a BITCAST whose operand
	/// is an EXTRACT_SUBVECTOR.
	static bool isEssentiallyExtractSubvector(SDValue N) {
	  // Look through a single bitcast, if there is one.
	  const bool IsCast = N.getOpcode() == ISD::BITCAST;
	  SDValue Inner = IsCast ? N.getOperand(0) : N;
	  return Inner.getOpcode() == ISD::EXTRACT_SUBVECTOR;
	}

	/// \brief Helper structure to keep track of ISD::SET_CC operands.
	///
	/// The operand pointers are non-owning; isSetCC points them at operands 0 and
	/// 1 of the matched SETCC node.
	struct GenericSetCCInfo {
	// First compared operand.
	const SDValue *Opnd0;
	// Second compared operand.
	const SDValue *Opnd1;
	// Condition code of the comparison.
	ISD::CondCode CC;
	};

	/// \brief Helper structure to keep track of a SET_CC lowered into AArch64 code.
	///
	/// isSetCC fills this from a CSEL node: Cmp points at operand 3 of the CSEL
	/// (non-owning) and CC is taken from its operand 2, inverted when the CSEL
	/// selects 0 on true and 1 on false.
	struct AArch64SetCCInfo {
	// Comparison value feeding the CSEL.
	const SDValue *Cmp;
	// AArch64 condition code under which the result is 1.
	AArch64CC::CondCode CC;
	};

	/// \brief Helper structure to keep track of SetCC information.
	///
	/// This is an untagged union; SetCCInfoAndKind::IsAArch64 records which
	/// member is the active one.
	union SetCCInfo {
	// Valid for a generic ISD::SETCC.
	GenericSetCCInfo Generic;
	// Valid for an AArch64-lowered comparison (CSEL form).
	AArch64SetCCInfo AArch64;
	};

	/// \brief Helper structure to be able to read SetCC information. When the
	/// IsAArch64 field is true, Info holds an AArch64SetCCInfo; otherwise Info
	/// holds a GenericSetCCInfo.
	struct SetCCInfoAndKind {
	// The SetCC description; which union member is valid depends on IsAArch64.
	SetCCInfo Info;
	// Discriminator for Info.
	bool IsAArch64;
	};

	/// \brief Check whether or not \p Op is a SET_CC operation, either a generic
	/// one or an AArch64 lowered one (a CSEL selecting between the constants 1
	/// and 0).
	/// \p SetCCInfo is filled accordingly.
	/// \post SetCCInfo is meaningful only when this function returns true; for a
	/// CSEL that fails the constant checks it may already have been partially
	/// written.
	/// \return True when Op is a kind of SET_CC operation.
	static bool isSetCC(SDValue Op, SetCCInfoAndKind &SetCCInfo) {
	// If this is a setcc, this is straightforward.
	if (Op.getOpcode() == ISD::SETCC) {
	SetCCInfo.Info.Generic.Opnd0 = &Op.getOperand(0);
	SetCCInfo.Info.Generic.Opnd1 = &Op.getOperand(1);
	SetCCInfo.Info.Generic.CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
	SetCCInfo.IsAArch64 = false;
	return true;
	}
	// Otherwise, check if this is a matching csel instruction.
	// In other words:
	// - csel 1, 0, cc
	// - csel 0, 1, !cc
	if (Op.getOpcode() != AArch64ISD::CSEL)
	return false;
	// Set the information about the operands.
	// TODO: we want the operands of the Cmp not the csel
	SetCCInfo.Info.AArch64.Cmp = &Op.getOperand(3);
	SetCCInfo.IsAArch64 = true;
	SetCCInfo.Info.AArch64.CC = static_cast<AArch64CC::CondCode>(
	cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue());

	// Check that the operands match the constraints:
	// (1) Both operands must be constants.
	// (2) One must be 1 and the other must be 0.
	ConstantSDNode *TValue = dyn_cast<ConstantSDNode>(Op.getOperand(0));
	ConstantSDNode *FValue = dyn_cast<ConstantSDNode>(Op.getOperand(1));

	// Check (1).
	if (!TValue \|\| !FValue)
	return false;

	// Check (2).
	if (!TValue->isOne()) {
	// Update the comparison when we are interested in !cc: swap the two
	// constants and invert the recorded condition so that the final check
	// below only has to recognise the "1 on true, 0 on false" form.
	std::swap(TValue, FValue);
	SetCCInfo.Info.AArch64.CC =
	AArch64CC::getInvertedCondCode(SetCCInfo.Info.AArch64.CC);
	}
	return TValue->isOne() && FValue->isNullValue();
	}

	// Returns true when Op is a SET_CC (as recognised by isSetCC) or a
	// ZERO_EXTEND of one. Info is filled in by isSetCC.
	static bool isSetCCOrZExtSetCC(const SDValue& Op, SetCCInfoAndKind &Info) {
	  if (isSetCC(Op, Info))
	    return true;

	  // Otherwise the only remaining option is a zero-extended SET_CC.
	  if (Op.getOpcode() != ISD::ZERO_EXTEND)
	    return false;
	  return isSetCC(Op->getOperand(0), Info);
	}

	// Fold an add of a (possibly zero-extended) SET_CC result:
	//   (add x, [zext] (setcc cc ...))
	//     -->
	//   (csel x, (add x, 1), !cc ...)
	//
	// The resulting CSEL is what gets matched to a CSINC instruction. Only
	// comparisons of i32 or i64 values are handled.
	static SDValue performSetccAddFolding(SDNode *Op, SelectionDAG &DAG) {
	  assert(Op && Op->getOpcode() == ISD::ADD && "Unexpected operation!");

	  SDValue SetCCSide = Op->getOperand(0);
	  SDValue Addend = Op->getOperand(1);
	  SetCCInfoAndKind InfoAndKind;

	  // The SET_CC may be on either side of the add; if it is not the first
	  // operand, swap and try the other one.
	  if (!isSetCCOrZExtSetCC(SetCCSide, InfoAndKind)) {
	    std::swap(SetCCSide, Addend);
	    if (!isSetCCOrZExtSetCC(SetCCSide, InfoAndKind))
	      return SDValue();
	  }

	  // FIXME: This could be generalized to work for FP comparisons.
	  EVT CmpVT;
	  if (InfoAndKind.IsAArch64)
	    CmpVT = InfoAndKind.Info.AArch64.Cmp->getOperand(0).getValueType();
	  else
	    CmpVT = InfoAndKind.Info.Generic.Opnd0->getValueType();
	  if (CmpVT != MVT::i32 && CmpVT != MVT::i64)
	    return SDValue();

	  // Obtain the comparison together with the *inverted* condition code.
	  SDLoc dl(Op);
	  SDValue CCVal;
	  SDValue Cmp;
	  if (InfoAndKind.IsAArch64) {
	    const AArch64CC::CondCode Inverted =
	        AArch64CC::getInvertedCondCode(InfoAndKind.Info.AArch64.CC);
	    CCVal = DAG.getConstant(Inverted, dl, MVT::i32);
	    Cmp = *InfoAndKind.Info.AArch64.Cmp;
	  } else {
	    const ISD::CondCode Inverted =
	        ISD::getSetCCInverse(InfoAndKind.Info.Generic.CC, true);
	    Cmp = getAArch64Cmp(*InfoAndKind.Info.Generic.Opnd0,
	                        *InfoAndKind.Info.Generic.Opnd1, Inverted, CCVal, DAG,
	                        dl);
	  }

	  // Select x when the inverted condition holds, x + 1 otherwise.
	  EVT VT = Op->getValueType(0);
	  SDValue Incremented =
	      DAG.getNode(ISD::ADD, dl, VT, Addend, DAG.getConstant(1, dl, VT));
	  return DAG.getNode(AArch64ISD::CSEL, dl, VT, Addend, Incremented, CCVal,
	                     Cmp);
	}

	// The add/sub long vector instructions have "2" variants that operate on the
	// high half of their inputs. Those are normally selected from patterns such
	// as:
	//
	//   (add (zeroext (extract_high LHS)),
	//        (zeroext (extract_high RHS)))
	//   -> uaddl2 vD, vN, vM
	//
	// When one of the extended operands is instead something like a duplicate,
	// the "2" instruction can still be used profitably. This function rewrites
	// the DAG into a form that lets those patterns fire. Results that are not
	// 128-bit vectors are handed to performSetccAddFolding when the node is an
	// ADD.
	static SDValue performAddSubLongCombine(SDNode *N,
	                                        TargetLowering::DAGCombinerInfo &DCI,
	                                        SelectionDAG &DAG) {
	  if (DCI.isBeforeLegalizeOps())
	    return SDValue();

	  MVT VT = N->getSimpleValueType(0);
	  if (!VT.is128BitVector())
	    return N->getOpcode() == ISD::ADD ? performSetccAddFolding(N, DAG)
	                                      : SDValue();

	  // Both operands have to be extended, and in the same way.
	  SDValue LHS = N->getOperand(0);
	  SDValue RHS = N->getOperand(1);
	  const unsigned ExtOpc = LHS.getOpcode();
	  if (ExtOpc != ISD::ZERO_EXTEND && ExtOpc != ISD::SIGN_EXTEND)
	    return SDValue();
	  if (RHS.getOpcode() != ExtOpc)
	    return SDValue();

	  SDLoc DL(N);

	  // The rewrite only pays off when one input is already an extract; since we
	  // do not know which, try the left one first and then the right one. The
	  // opposite input is the one that gets widened.
	  if (isEssentiallyExtractSubvector(LHS.getOperand(0))) {
	    SDValue Widened = tryExtendDUPToExtractHigh(RHS.getOperand(0), DAG);
	    if (!Widened.getNode())
	      return SDValue();
	    RHS = DAG.getNode(ExtOpc, DL, VT, Widened);
	  } else if (isEssentiallyExtractSubvector(RHS.getOperand(0))) {
	    SDValue Widened = tryExtendDUPToExtractHigh(LHS.getOperand(0), DAG);
	    if (!Widened.getNode())
	      return SDValue();
	    LHS = DAG.getNode(ExtOpc, DL, VT, Widened);
	  }

	  // Rebuild the operation from the (possibly updated) operands.
	  return DAG.getNode(N->getOpcode(), DL, VT, LHS, RHS);
	}

	// Reshape DAGs on which the high-half "long" operations can be used into a
	// form instruction selection recognizes more readily, e.g.
	//
	//   (aarch64_neon_umull (extract_high vec), (dupv64 scalar)) -->
	//   (aarch64_neon_umull (extract_high (v2i64 vec)),
	//                       (extract_high (v2i64 (dup128 scalar))))
	//
	// IID is not consulted here; the intrinsic ID operand of N is reused as is.
	static SDValue tryCombineLongOpWithDup(unsigned IID, SDNode *N,
	                                       TargetLowering::DAGCombinerInfo &DCI,
	                                       SelectionDAG &DAG) {
	  if (DCI.isBeforeLegalizeOps())
	    return SDValue();

	  SDValue First = N->getOperand(1);
	  SDValue Second = N->getOperand(2);
	  assert(First.getValueType().is64BitVector() &&
	         Second.getValueType().is64BitVector() &&
	         "unexpected shape for long operation");

	  // Either operand could be the DUP, but widening both is pointless (the
	  // non-high instruction would do just as well). So find an operand that is
	  // already an extract and widen the one on the opposite "wing".
	  SDValue *ToWiden = nullptr;
	  if (isEssentiallyExtractSubvector(First))
	    ToWiden = &Second;
	  else if (isEssentiallyExtractSubvector(Second))
	    ToWiden = &First;

	  if (ToWiden) {
	    *ToWiden = tryExtendDUPToExtractHigh(*ToWiden, DAG);
	    if (!ToWiden->getNode())
	      return SDValue();
	  }

	  // Rebuild the intrinsic call from the (possibly updated) operands.
	  return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SDLoc(N), N->getValueType(0),
	                     N->getOperand(0), First, Second);
	}

	// Convert a NEON shift intrinsic whose shift amount (operand 2 of N) is a
	// constant, or a constant splat with the element width, into the
	// corresponding AArch64ISD immediate-shift node. IID selects the opcode.
	// Returns an empty SDValue when the amount is not such a constant or is
	// outside the range accepted for the chosen opcode.
	static SDValue tryCombineShiftImm(unsigned IID, SDNode *N, SelectionDAG &DAG) {
	MVT ElemTy = N->getSimpleValueType(0).getScalarType();
	unsigned ElemBits = ElemTy.getSizeInBits();

	// Extract the (signed) shift amount from either a splat build_vector or a
	// scalar constant.
	int64_t ShiftAmount;
	if (BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(N->getOperand(2))) {
	APInt SplatValue, SplatUndef;
	unsigned SplatBitSize;
	bool HasAnyUndefs;
	if (!BVN->isConstantSplat(SplatValue, SplatUndef, SplatBitSize,
	HasAnyUndefs, ElemBits) \|\|
	SplatBitSize != ElemBits)
	return SDValue();

	ShiftAmount = SplatValue.getSExtValue();
	} else if (ConstantSDNode *CVN = dyn_cast<ConstantSDNode>(N->getOperand(2))) {
	ShiftAmount = CVN->getSExtValue();
	} else
	return SDValue();

	// Map the intrinsic to its immediate-form opcode. The rounding shifts
	// (srshl/urshl) are mapped to right-shift opcodes.
	unsigned Opcode;
	bool IsRightShift;
	switch (IID) {
	default:
	llvm_unreachable("Unknown shift intrinsic");
	case Intrinsic::aarch64_neon_sqshl:
	Opcode = AArch64ISD::SQSHL_I;
	IsRightShift = false;
	break;
	case Intrinsic::aarch64_neon_uqshl:
	Opcode = AArch64ISD::UQSHL_I;
	IsRightShift = false;
	break;
	case Intrinsic::aarch64_neon_srshl:
	Opcode = AArch64ISD::SRSHR_I;
	IsRightShift = true;
	break;
	case Intrinsic::aarch64_neon_urshl:
	Opcode = AArch64ISD::URSHR_I;
	IsRightShift = true;
	break;
	case Intrinsic::aarch64_neon_sqshlu:
	Opcode = AArch64ISD::SQSHLU_I;
	IsRightShift = false;
	break;
	}

	// Right-shift opcodes accept only negative amounts in [-ElemBits, -1] and
	// are emitted with the negated (positive) amount; left-shift opcodes accept
	// amounts in [0, ElemBits).
	if (IsRightShift && ShiftAmount <= -1 && ShiftAmount >= -(int)ElemBits) {
	SDLoc dl(N);
	return DAG.getNode(Opcode, dl, N->getValueType(0), N->getOperand(1),
	DAG.getConstant(-ShiftAmount, dl, MVT::i32));
	} else if (!IsRightShift && ShiftAmount >= 0 && ShiftAmount < ElemBits) {
	SDLoc dl(N);
	return DAG.getNode(Opcode, dl, N->getValueType(0), N->getOperand(1),
	DAG.getConstant(ShiftAmount, dl, MVT::i32));
	}

	return SDValue();
	}

	// The CRC32[BH] instructions ignore the high bits of their data operand.
	// Because the intrinsics take an i32, the data operand is usually masked with
	// an AND; when that AND uses exactly \p Mask it is redundant and the intrinsic
	// is rebuilt on the un-masked value. Returns an empty SDValue otherwise.
	static SDValue tryCombineCRC32(unsigned Mask, SDNode *N, SelectionDAG &DAG) {
	  SDValue Data = N->getOperand(2);
	  if (Data.getOpcode() == ISD::AND) {
	    ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(Data.getOperand(1));
	    if (MaskC && MaskC->getZExtValue() == Mask)
	      return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SDLoc(N), MVT::i32,
	                         N->getOperand(0), N->getOperand(1),
	                         Data.getOperand(0));
	  }
	  return SDValue();
	}

	// Rewrite an across-lanes intrinsic as node \p Opc applied to the vector
	// operand (operand 1, keeping its vector type), followed by an extract of
	// element 0 producing the intrinsic's result type.
	static SDValue combineAcrossLanesIntrinsic(unsigned Opc, SDNode *N,
	                                           SelectionDAG &DAG) {
	  SDLoc dl(N);
	  SDValue Vec = N->getOperand(1);
	  SDValue Reduced = DAG.getNode(Opc, dl, Vec.getSimpleValueType(), Vec);
	  SDValue LaneZero = DAG.getConstant(0, dl, MVT::i64);
	  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, N->getValueType(0), Reduced,
	                     LaneZero);
	}

	// Dispatch the intrinsic node \p N to the combine matching its intrinsic ID.
	// Intrinsics without a dedicated combine yield an empty SDValue.
	static SDValue performIntrinsicCombine(SDNode *N,
	                                       TargetLowering::DAGCombinerInfo &DCI,
	                                       const AArch64Subtarget *Subtarget) {
	  SelectionDAG &DAG = DCI.DAG;
	  const unsigned IID = getIntrinsicID(N);

	  // Two-operand intrinsic -> node Opc on operands 1 and 2.
	  auto AsBinaryNode = [&](unsigned Opc) {
	    return DAG.getNode(Opc, SDLoc(N), N->getValueType(0), N->getOperand(1),
	                       N->getOperand(2));
	  };
	  // Across-lanes reduction -> reduction node + extract of lane 0.
	  auto AsAcrossLanes = [&](unsigned Opc) {
	    return combineAcrossLanesIntrinsic(Opc, N, DAG);
	  };

	  switch (IID) {
	  // Fixed-point conversions.
	  case Intrinsic::aarch64_neon_vcvtfxs2fp:
	  case Intrinsic::aarch64_neon_vcvtfxu2fp:
	    return tryCombineFixedPointConvert(N, DCI, DAG);

	  // Across-lanes integer reductions.
	  case Intrinsic::aarch64_neon_saddv:
	    return AsAcrossLanes(AArch64ISD::SADDV);
	  case Intrinsic::aarch64_neon_uaddv:
	    return AsAcrossLanes(AArch64ISD::UADDV);
	  case Intrinsic::aarch64_neon_sminv:
	    return AsAcrossLanes(AArch64ISD::SMINV);
	  case Intrinsic::aarch64_neon_uminv:
	    return AsAcrossLanes(AArch64ISD::UMINV);
	  case Intrinsic::aarch64_neon_smaxv:
	    return AsAcrossLanes(AArch64ISD::SMAXV);
	  case Intrinsic::aarch64_neon_umaxv:
	    return AsAcrossLanes(AArch64ISD::UMAXV);

	  // Floating-point min/max map directly onto generic ISD nodes.
	  case Intrinsic::aarch64_neon_fmax:
	    return AsBinaryNode(ISD::FMAXNAN);
	  case Intrinsic::aarch64_neon_fmin:
	    return AsBinaryNode(ISD::FMINNAN);
	  case Intrinsic::aarch64_neon_fmaxnm:
	    return AsBinaryNode(ISD::FMAXNUM);
	  case Intrinsic::aarch64_neon_fminnm:
	    return AsBinaryNode(ISD::FMINNUM);

	  // Widening multiplies.
	  case Intrinsic::aarch64_neon_smull:
	  case Intrinsic::aarch64_neon_umull:
	  case Intrinsic::aarch64_neon_pmull:
	  case Intrinsic::aarch64_neon_sqdmull:
	    return tryCombineLongOpWithDup(IID, N, DCI, DAG);

	  // Shifts by a constant amount.
	  case Intrinsic::aarch64_neon_sqshl:
	  case Intrinsic::aarch64_neon_uqshl:
	  case Intrinsic::aarch64_neon_sqshlu:
	  case Intrinsic::aarch64_neon_srshl:
	  case Intrinsic::aarch64_neon_urshl:
	    return tryCombineShiftImm(IID, N, DAG);

	  // CRC32 on bytes / halfwords: drop a redundant mask of the data operand.
	  case Intrinsic::aarch64_crc32b:
	  case Intrinsic::aarch64_crc32cb:
	    return tryCombineCRC32(0xff, N, DAG);
	  case Intrinsic::aarch64_crc32h:
	  case Intrinsic::aarch64_crc32ch:
	    return tryCombineCRC32(0xffff, N, DAG);

	  default:
	    break;
	  }
	  return SDValue();
	}

	// Combine for extend nodes. Two independent transformations live here:
	//
	//  1. After operation legalization: (zext (sabd/uabd ...)) is handed to
	//     tryCombineLongOpWithDup so that a DUP operand can become an
	//     extract_high of a bigger DUP, which helps the backend decide that an
	//     [us]abdl2 would be useful and saves a real extract_high operation.
	//
	//  2. Before operation legalization: an extend from a 64-bit vector to an
	//     illegal vector type is done in two stages instead of letting generic
	//     type legalization split it. See the comment in the body.
	static SDValue performExtendCombine(SDNode *N,
	                                    TargetLowering::DAGCombinerInfo &DCI,
	                                    SelectionDAG &DAG) {
	  const bool PreLegalizeOps = DCI.isBeforeLegalizeOps();
	  const unsigned ExtOpc = N->getOpcode();

	  if (!PreLegalizeOps && ExtOpc == ISD::ZERO_EXTEND &&
	      N->getOperand(0).getOpcode() == ISD::INTRINSIC_WO_CHAIN) {
	    SDNode *Inner = N->getOperand(0).getNode();
	    const unsigned IID = getIntrinsicID(Inner);
	    if (IID == Intrinsic::aarch64_neon_sabd ||
	        IID == Intrinsic::aarch64_neon_uabd) {
	      SDValue Widened = tryCombineLongOpWithDup(IID, Inner, DCI, DAG);
	      if (Widened.getNode())
	        return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0),
	                           Widened);
	      return SDValue();
	    }
	  }

	  // What follows is effectively a custom type legalization for AArch64.
	  //
	  // Generic type legalization splits an extend from a small legal type to a
	  // larger illegal type by splitting the destination first. That often
	  // creates illegal source types, which are then legalized in ways that
	  // confuse isel and produce terrible code. E.g.,
	  //     %result = v8i32 sext v8i8 %value
	  // becomes
	  //     %losrc = extract_subreg %value, ...
	  //     %hisrc = extract_subreg %value, ...
	  //     %lo = v4i32 sext v4i8 %losrc
	  //     %hi = v4i32 sext v4i8 %hisrc
	  //
	  // The AArch64 [sz]ext vector instructions only widen by one element size
	  // (i8 -> i16 is one instruction, i8 -> i32 takes two). So the efficient way
	  // to extend v8i8 to two v4i32 values is to first extend v8i8 to v8i16 and
	  // then let the v8i16 -> v8i32 step be split in the normal way.
	  //
	  // This has to run pre-legalization to get in ahead of the default handling.
	  if (!PreLegalizeOps)
	    return SDValue();

	  // Only illegal vector result types need help; legal ones just work.
	  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	  EVT ResVT = N->getValueType(0);
	  if (!ResVT.isVector() || TLI.isTypeLegal(ResVT))
	    return SDValue();

	  // Non-simple types are out of scope; leave them to legalization.
	  SDValue Src = N->getOperand(0);
	  EVT SrcVT = Src->getValueType(0);
	  if (!ResVT.isSimple() || !SrcVT.isSimple())
	    return SDValue();

	  // The trick only works when the source is a 64-bit vector.
	  if (SrcVT.getSizeInBits() != 64)
	    return SDValue();

	  // Stage one: widen every source element to twice its size.
	  const unsigned SrcEltBits = SrcVT.getScalarSizeInBits();
	  const unsigned SrcElts = SrcVT.getVectorNumElements();
	  EVT WideSrcVT =
	      MVT::getVectorVT(MVT::getIntegerVT(SrcEltBits * 2), SrcElts);
	  SDLoc DL(N);
	  SDValue WideSrc = DAG.getNode(ExtOpc, DL, WideSrcVT, Src);

	  // Stage two: split the remaining extend into two halves, each of which
	  // has a 64-bit source.
	  const unsigned NumElements = ResVT.getVectorNumElements();
	  assert(!(NumElements & 1) && "Splitting vector, but not in half!");
	  const unsigned HalfElts = NumElements / 2;
	  EVT HalfResVT = EVT::getVectorVT(*DAG.getContext(),
	                                   ResVT.getVectorElementType(), HalfElts);
	  EVT HalfSrcVT = EVT::getVectorVT(
	      *DAG.getContext(), WideSrcVT.getVectorElementType(), HalfElts);

	  SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfSrcVT, WideSrc,
	                           DAG.getConstant(0, DL, MVT::i64));
	  SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfSrcVT, WideSrc,
	                           DAG.getConstant(HalfElts, DL, MVT::i64));
	  Lo = DAG.getNode(ExtOpc, DL, HalfResVT, Lo);
	  Hi = DAG.getNode(ExtOpc, DL, HalfResVT, Hi);

	  // Glue the halves back together: the combiner expects a single result.
	  return DAG.getNode(ISD::CONCAT_VECTORS, DL, ResVT, Lo, Hi);
	}

	// Replace the vector store St by NumVecElts chained scalar stores of
	// SplatVal. The first store goes to St's base pointer with St's alignment;
	// each further store is placed one SplatVal-sized step (in bytes) beyond the
	// previous one and is chained on the store emitted before it. All stores
	// reuse the flags of St's memory operand. Returns the last store created.
	//
	// NumVecElts must be at least 1: the loop below pre-decrements it before
	// testing. The callers visible in this file pass 2, 3 or 4.
	static SDValue splitStoreSplat(SelectionDAG &DAG, StoreSDNode &St,
	SDValue SplatVal, unsigned NumVecElts) {
	unsigned OrigAlignment = St.getAlignment();
	// Distance in bytes between two consecutive scalar stores.
	unsigned EltOffset = SplatVal.getValueType().getSizeInBits() / 8;

	// Create scalar stores. This is at least as good as the code sequence for a
	// split unaligned store which is a dup.s, ext.b, and two stores.
	// Most of the time the three stores should be replaced by store pair
	// instructions (stp).
	SDLoc DL(&St);
	SDValue BasePtr = St.getBasePtr();
	+ const MachinePointerInfo &PtrInfo = St.getPointerInfo();
	SDValue NewST1 =
	- DAG.getStore(St.getChain(), DL, SplatVal, BasePtr, St.getPointerInfo(),
	+ DAG.getStore(St.getChain(), DL, SplatVal, BasePtr, PtrInfo,
	OrigAlignment, St.getMemOperand()->getFlags());

	// Emit the remaining NumVecElts - 1 stores at increasing byte offsets.
	unsigned Offset = EltOffset;
	while (--NumVecElts) {
	// Alignment known for this address, derived from the original alignment
	// and the byte offset from the base.
	unsigned Alignment = MinAlign(OrigAlignment, Offset);
	SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr,
	DAG.getConstant(Offset, DL, MVT::i64));
	NewST1 = DAG.getStore(NewST1.getValue(0), DL, SplatVal, OffsetPtr,
	- St.getPointerInfo(), Alignment,
	+ PtrInfo.getWithOffset(Offset), Alignment,
	St.getMemOperand()->getFlags());
	Offset += EltOffset;
	}
	return NewST1;
	}

	/// Turn a vector store of an all-zero BUILD_VECTOR into scalar stores of
	/// WZR/XZR. The load/store optimizer is expected to merge those into store
	/// pair instructions, which beats materializing a zero vector with movi and
	/// storing it when the zero constant has no other use: one instruction and
	/// one register live range disappear.
	///
	/// For example, the final generated code should be:
	///
	///   stp xzr, xzr, [x0]
	///
	/// instead of:
	///
	///   movi v0.2d, #0
	///   str q0, [x0]
	///
	/// Returns an empty SDValue when the store does not qualify.
	static SDValue replaceZeroVectorStore(SelectionDAG &DAG, StoreSDNode &St) {
	  SDValue StVal = St.getValue();
	  EVT VT = StVal.getValueType();

	  // Scalarizing pays off for 2 or 3 i64 elements, or 2, 3 or 4 i32 elements.
	  int NumVecElts = VT.getVectorNumElements();
	  const unsigned EltBits = VT.getVectorElementType().getSizeInBits();
	  const bool Profitable =
	      (EltBits == 64 && (NumVecElts == 2 || NumVecElts == 3)) ||
	      (EltBits == 32 && NumVecElts >= 2 && NumVecElts <= 4);
	  if (!Profitable)
	    return SDValue();

	  if (StVal.getOpcode() != ISD::BUILD_VECTOR)
	    return SDValue();

	  // With several users of the zero constant, the vector store could be the
	  // better choice: the constant mov is amortized and stp q instructions
	  // should be able to be formed.
	  if (!StVal.hasOneUse())
	    return SDValue();

	  // Bail out when the immediate offset of the address is too large for stp.
	  if (DAG.isBaseWithConstantOffset(St.getBasePtr())) {
	    int64_t Offset = St.getBasePtr()->getConstantOperandVal(1);
	    if (Offset < -512 || Offset > 504)
	      return SDValue();
	  }

	  // Every element has to be an integer or floating-point zero.
	  for (int Idx = 0; Idx < NumVecElts; ++Idx) {
	    SDValue Elt = StVal.getOperand(Idx);
	    if (!(isNullConstant(Elt) || isNullFPConstant(Elt)))
	      return SDValue();
	  }

	  // Use WZR/XZR here to prevent DAGCombiner::MergeConsecutiveStores from
	  // undoing this transformation.
	  SDValue ZeroReg;
	  if (EltBits == 32)
	    ZeroReg = DAG.getRegister(AArch64::WZR, MVT::i32);
	  else
	    ZeroReg = DAG.getRegister(AArch64::XZR, MVT::i64);
	  return splitStoreSplat(DAG, St, ZeroReg, NumVecElts);
	}

	/// Turn a vector store of a scalar splat (built as a chain of
	/// INSERT_VECTOR_ELT nodes) into scalar stores of that scalar. The load/store
	/// optimizer is expected to merge them into store pair instructions, which
	/// performs better than a splat followed by a split vector store. Even
	/// without merging it is four stores versus a dup, an ext.b and two stores.
	///
	/// Returns an empty SDValue when the stored value is not such a splat.
	static SDValue replaceSplatVectorStore(SelectionDAG &DAG, StoreSDNode &St) {
	  SDValue StVal = St.getValue();
	  EVT VT = StVal.getValueType();

	  // Don't replace floating point stores, they possibly won't be transformed
	  // to stp because of the store pair suppress pass.
	  if (VT.isFloatingPoint())
	    return SDValue();

	  // A splat can be expressed as store pair(s) for 2 or 4 elements only.
	  unsigned NumVecElts = VT.getVectorNumElements();
	  if (NumVecElts != 2 && NumVecElts != 4)
	    return SDValue();

	  // Walk down the insert chain, one link per vector element. All links must
	  // insert the same value at a constant in-range index, and together they
	  // must cover every element, i.e. 0 and 1 for v2i64 and 0, 1, 2, 3 for
	  // v4i32.
	  std::bitset<4> Covered;
	  SDValue SplatVal;
	  SDValue Link = StVal;
	  for (unsigned Step = 0; Step < NumVecElts; ++Step) {
	    if (Link.getOpcode() != ISD::INSERT_VECTOR_ELT)
	      return SDValue();

	    // The first link defines the splat value; the others must agree.
	    SDValue Inserted = Link.getOperand(1);
	    if (Step == 0)
	      SplatVal = Inserted;
	    else if (Inserted != SplatVal)
	      return SDValue();

	    // The insert position has to be a constant inside the vector.
	    ConstantSDNode *Pos = dyn_cast<ConstantSDNode>(Link.getOperand(2));
	    if (!Pos)
	      return SDValue();
	    uint64_t PosVal = Pos->getZExtValue();
	    if (PosVal >= NumVecElts)
	      return SDValue();
	    Covered.set(PosVal);

	    Link = Link.getOperand(0);
	  }

	  // Reject chains that wrote some element twice and left another untouched.
	  if (Covered.count() != NumVecElts)
	    return SDValue();

	  return splitStoreSplat(DAG, St, SplatVal, NumVecElts);
	}

	// Pre-legalization combine for non-volatile vector stores. In order, it
	// tries to:
	//   - replace a store of an all-zero vector by scalar stores of WZR/XZR,
	//   - on subtargets where misaligned 128-bit stores are slow (and unless
	//     optimizing for minimum size), replace an under-aligned 128-bit store
	//     of a scalar splat by scalar stores, or otherwise
	//   - split such a store into two 64-bit halves.
	// Returns the replacement store, or an empty SDValue if nothing applies.
	static SDValue splitStores(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
	                           SelectionDAG &DAG,
	                           const AArch64Subtarget *Subtarget) {
	  if (!DCI.isBeforeLegalize())
	    return SDValue();

	  StoreSDNode *S = cast<StoreSDNode>(N);
	  if (S->isVolatile())
	    return SDValue();

	  SDValue StVal = S->getValue();
	  EVT VT = StVal.getValueType();
	  if (!VT.isVector())
	    return SDValue();

	  // If we get a splat of zeros, convert this vector store to a store of
	  // scalars. They will be merged into store pairs of xzr thereby removing one
	  // instruction and one register.
	  if (SDValue ReplacedZeroSplat = replaceZeroVectorStore(DAG, *S))
	    return ReplacedZeroSplat;

	  // FIXME: The logic for deciding if an unaligned store should be split should
	  // be included in TLI.allowsMisalignedMemoryAccesses(), and there should be
	  // a call to that function here.

	  if (!Subtarget->isMisaligned128StoreSlow())
	    return SDValue();

	  // Don't split at -Oz.
	  if (DAG.getMachineFunction().getFunction()->optForMinSize())
	    return SDValue();

	  // Don't split v2i64 vectors. Memcpy lowering produces those and splitting
	  // those up regresses performance on micro-benchmarks and olden/bh.
	  if (VT.getVectorNumElements() < 2 || VT == MVT::v2i64)
	    return SDValue();

	  // Split unaligned 16B stores. They are terrible for performance.
	  // Don't split stores with alignment of 1 or 2. Code that uses clang vector
	  // extensions can use this to mark that it does not want splitting to happen
	  // (by underspecifying alignment to be 1 or 2). Furthermore, the chance of
	  // eliminating alignment hazards is only 1 in 8 for alignment of 2.
	  if (VT.getSizeInBits() != 128 || S->getAlignment() >= 16 ||
	      S->getAlignment() <= 2)
	    return SDValue();

	  // If we get a splat of a scalar convert this vector store to a store of
	  // scalars. They will be merged into store pairs thereby removing two
	  // instructions.
	  if (SDValue ReplacedSplat = replaceSplatVectorStore(DAG, *S))
	    return ReplacedSplat;

	  SDLoc DL(S);
	  unsigned NumElts = VT.getVectorNumElements() / 2;
	  // Split VT into two.
	  EVT HalfVT =
	      EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), NumElts);
	  SDValue SubVector0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, StVal,
	                                   DAG.getConstant(0, DL, MVT::i64));
	  SDValue SubVector1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, StVal,
	                                   DAG.getConstant(NumElts, DL, MVT::i64));
	  SDValue BasePtr = S->getBasePtr();
	  const MachinePointerInfo &PtrInfo = S->getPointerInfo();
	  SDValue NewST1 =
	      DAG.getStore(S->getChain(), DL, SubVector0, BasePtr, PtrInfo,
	                   S->getAlignment(), S->getMemOperand()->getFlags());
	  // The upper half lives 8 bytes past the base. Its pointer info has to
	  // carry the same offset so that the two halves are not described as
	  // writing the same memory location.
	  SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr,
	                                  DAG.getConstant(8, DL, MVT::i64));
	  return DAG.getStore(NewST1.getValue(0), DL, SubVector1, OffsetPtr,
	                      PtrInfo.getWithOffset(8), S->getAlignment(),
	                      S->getMemOperand()->getFlags());
	}

	/// Target-specific DAG combine function for post-increment LD1 (lane) and
	/// post-increment LD1R.
	///
	/// \p N consumes a scalar load: as operand 1 when \p IsLaneOp is true (with
	/// operand 0 the vector being inserted into and operand 2 the lane), as
	/// operand 0 otherwise. When the load address is also incremented by an
	/// independent ADD, the load, \p N and the ADD are replaced by a single
	/// LD1LANEpost / LD1DUPpost node through DCI.CombineTo.
	///
	/// Always returns an empty SDValue; replacements are made through \p DCI.
	static SDValue performPostLD1Combine(SDNode *N,
	                                     TargetLowering::DAGCombinerInfo &DCI,
	                                     bool IsLaneOp) {
	  if (DCI.isBeforeLegalizeOps())
	    return SDValue();

	  SelectionDAG &DAG = DCI.DAG;
	  EVT VT = N->getValueType(0);

	  unsigned LoadIdx = IsLaneOp ? 1 : 0;
	  SDNode *LD = N->getOperand(LoadIdx).getNode();
	  // If it is not LOAD, can not do such combine.
	  if (LD->getOpcode() != ISD::LOAD)
	    return SDValue();

	  LoadSDNode *LoadSDN = cast<LoadSDNode>(LD);
	  EVT MemVT = LoadSDN->getMemoryVT();
	  // Check if memory operand is the same type as the vector element.
	  if (MemVT != VT.getVectorElementType())
	    return SDValue();

	  // Check if there are other uses. If so, do not combine as it will introduce
	  // an extra load.
	  for (SDNode::use_iterator UI = LD->use_begin(), UE = LD->use_end(); UI != UE;
	       ++UI) {
	    if (UI.getUse().getResNo() == 1) // Ignore uses of the chain result.
	      continue;
	    if (*UI != N)
	      return SDValue();
	  }

	  SDValue Addr = LD->getOperand(1);
	  SDValue Vector = N->getOperand(0);
	  // Search for a use of the address operand that is an increment.
	  for (SDNode::use_iterator UI = Addr.getNode()->use_begin(),
	                            UE = Addr.getNode()->use_end();
	       UI != UE; ++UI) {
	    // Dereferencing the use iterator yields the node using the address.
	    SDNode *User = *UI;
	    if (User->getOpcode() != ISD::ADD ||
	        UI.getUse().getResNo() != Addr.getResNo())
	      continue;

	    // Check that the add is independent of the load. Otherwise, folding it
	    // would create a cycle.
	    if (User->isPredecessorOf(LD) || LD->isPredecessorOf(User))
	      continue;
	    // Also check that add is not used in the vector operand. This would also
	    // create a cycle.
	    if (User->isPredecessorOf(Vector.getNode()))
	      continue;

	    // If the increment is a constant, it must match the memory ref size.
	    SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0);
	    if (ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode())) {
	      uint32_t IncVal = CInc->getZExtValue();
	      unsigned NumBytes = VT.getScalarSizeInBits() / 8;
	      if (IncVal != NumBytes)
	        continue;
	      // A matching constant increment is passed on as XZR.
	      Inc = DAG.getRegister(AArch64::XZR, MVT::i64);
	    }

	    // Finally, check that the vector doesn't depend on the load.
	    // Again, this would create a cycle.
	    // The load depending on the vector is fine, as that's the case for the
	    // LD1*post we'll eventually generate anyway.
	    if (LoadSDN->isPredecessorOf(Vector.getNode()))
	      continue;

	    SmallVector<SDValue, 8> Ops;
	    Ops.push_back(LD->getOperand(0)); // Chain
	    if (IsLaneOp) {
	      Ops.push_back(Vector);           // The vector to be inserted
	      Ops.push_back(N->getOperand(2)); // The lane to be inserted in the vector
	    }
	    Ops.push_back(Addr);
	    Ops.push_back(Inc);

	    // Results: the vector, the written-back address and the chain.
	    EVT Tys[3] = { VT, MVT::i64, MVT::Other };
	    SDVTList SDTys = DAG.getVTList(Tys);
	    unsigned NewOp = IsLaneOp ? AArch64ISD::LD1LANEpost : AArch64ISD::LD1DUPpost;
	    SDValue UpdN = DAG.getMemIntrinsicNode(NewOp, SDLoc(N), SDTys, Ops,
	                                           MemVT,
	                                           LoadSDN->getMemOperand());

	    // Update the uses.
	    SDValue NewResults[] = {
	        SDValue(LD, 0),            // The result of load
	        SDValue(UpdN.getNode(), 2) // Chain
	    };
	    DCI.CombineTo(LD, NewResults);
	    DCI.CombineTo(N, SDValue(UpdN.getNode(), 0));    // Dup/Inserted Result
	    DCI.CombineTo(User, SDValue(UpdN.getNode(), 1)); // Write back register

	    break;
	  }
	  return SDValue();
	}

	/// Simplify \p Addr given that the top byte of it is ignored by HW during
	/// address translation: only the low 56 of its 64 bits are demanded.
	///
	/// \returns true if a simplification was found and committed through
	/// \p DCI, false otherwise.
	static bool performTBISimplification(SDValue Addr,
	                                     TargetLowering::DAGCombinerInfo &DCI,
	                                     SelectionDAG &DAG) {
	  const APInt LowBits = APInt::getLowBitsSet(64, 56);
	  APInt Zeros, Ones;
	  TargetLowering::TargetLoweringOpt TLO(DAG, DCI.isBeforeLegalize(),
	                                        DCI.isBeforeLegalizeOps());
	  const TargetLowering &TLI = DAG.getTargetLoweringInfo();

	  if (!TLI.SimplifyDemandedBits(Addr, LowBits, Zeros, Ones, TLO))
	    return false;

	  DCI.CommitTargetLoweringOpt(TLO);
	  return true;
	}

	// Combine for store nodes: first try the vector-store splitting done by
	// splitStores; failing that, on subtargets that ignore the address top byte,
	// try to simplify the address (operand 2). Returns the replacement from
	// splitStores, \p N itself when its address was simplified, or an empty
	// SDValue.
	static SDValue performSTORECombine(SDNode *N,
	                                   TargetLowering::DAGCombinerInfo &DCI,
	                                   SelectionDAG &DAG,
	                                   const AArch64Subtarget *Subtarget) {
	  SDValue Split = splitStores(N, DCI, DAG, Subtarget);
	  if (Split)
	    return Split;

	  if (!Subtarget->supportsAddressTopByteIgnored())
	    return SDValue();

	  if (!performTBISimplification(N->getOperand(2), DCI, DAG))
	    return SDValue();

	  return SDValue(N, 0);
	}

	/// Match the log2-shuffle pattern produced by the LoopVectorizer for an
	/// across-vector reduction and replace it by a single across-lanes node.
	/// The pattern has log2(NumVectorElements) steps; walking backwards from
	/// \p OpV, step s combines a value with a shuffle of itself that moves
	/// 2^s elements down to lane 0.
	///
	/// \p N supplies the result type and debug location, \p OpV is the last
	/// operation of the chain and \p Op the opcode being reduced. Returns an
	/// empty SDValue if the chain does not have the expected shape.
	static SDValue tryMatchAcrossLaneShuffleForReduction(SDNode *N, SDValue OpV,
	                                                     unsigned Op,
	                                                     SelectionDAG &DAG) {
	  EVT VTy = OpV->getOperand(0).getValueType();
	  if (!VTy.isVector())
	    return SDValue();

	  // FP min/max is handled for 4 lanes only, integers for 4, 8 or 16 lanes.
	  const int NumVecElts = VTy.getVectorNumElements();
	  const bool IsFPMinMax = (Op == ISD::FMAXNUM || Op == ISD::FMINNUM);
	  const bool LaneCountOK =
	      IsFPMinMax ? NumVecElts == 4
	                 : (NumVecElts == 4 || NumVecElts == 8 || NumVecElts == 16);
	  if (!LaneCountOK)
	    return SDValue();

	  const int NumExpectedSteps = APInt(8, NumVecElts).logBase2();
	  SDValue PreOp = OpV;
	  // Walk the reduction chain, one step per iteration.
	  for (int Step = 0; Step != NumExpectedSteps; ++Step) {
	    SDValue CurOp = PreOp.getOperand(0);
	    SDValue Shuffle = PreOp.getOperand(1);
	    if (Shuffle.getOpcode() != ISD::VECTOR_SHUFFLE) {
	      // Try to swap the 1st and 2nd operand as add and min/max instructions
	      // are commutative.
	      CurOp = PreOp.getOperand(1);
	      Shuffle = PreOp.getOperand(0);
	      if (Shuffle.getOpcode() != ISD::VECTOR_SHUFFLE)
	        return SDValue();
	    }

	    // Every input but the one of the last step has to be produced by Op; the
	    // very first input vector may come from anywhere.
	    const bool IsLastStep = (Step == NumExpectedSteps - 1);
	    if (!IsLastStep && CurOp.getOpcode() != Op)
	      return SDValue();

	    // The shuffle has to read the value it is combined with. E.g.,
	    //   %cur = add %1, %0
	    //   %shuffle = vector_shuffle %cur, <2, 3, u, u>
	    //   %pre = add %cur, %shuffle
	    if (Shuffle.getOperand(0) != CurOp)
	      return SDValue();

	    // Check the mask. It must have the form <M, U>: M is NumMaskElts
	    // consecutive integers starting at NumMaskElts, U is made of
	    // NumVecElts - NumMaskElts undefs. E.g., for <8 x i16>:
	    //   step 0 : <1,u,u,u,u,u,u,u>
	    //   step 1 : <2,3,u,u,u,u,u,u>
	    //   step 2 : <4,5,6,7,u,u,u,u>
	    const int NumMaskElts = 1 << Step;
	    ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Shuffle)->getMask();
	    for (int Lane = 0; Lane < NumVecElts; ++Lane) {
	      const int M = Mask[Lane];
	      const bool LaneOK =
	          Lane < NumMaskElts ? M == NumMaskElts + Lane : M < 0;
	      if (!LaneOK)
	        return SDValue();
	    }

	    PreOp = CurOp;
	  }

	  // Select the across-lanes operation; the FP ones are emitted as intrinsics.
	  unsigned Opcode;
	  bool IsIntrinsic = false;
	  switch (Op) {
	  case ISD::ADD:
	    Opcode = AArch64ISD::UADDV;
	    break;
	  case ISD::SMAX:
	    Opcode = AArch64ISD::SMAXV;
	    break;
	  case ISD::UMAX:
	    Opcode = AArch64ISD::UMAXV;
	    break;
	  case ISD::SMIN:
	    Opcode = AArch64ISD::SMINV;
	    break;
	  case ISD::UMIN:
	    Opcode = AArch64ISD::UMINV;
	    break;
	  case ISD::FMAXNUM:
	    Opcode = Intrinsic::aarch64_neon_fmaxnmv;
	    IsIntrinsic = true;
	    break;
	  case ISD::FMINNUM:
	    Opcode = Intrinsic::aarch64_neon_fminnmv;
	    IsIntrinsic = true;
	    break;
	  default:
	    llvm_unreachable("Unexpected operator for across vector reduction");
	  }

	  SDLoc DL(N);
	  if (IsIntrinsic)
	    return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, N->getValueType(0),
	                       DAG.getConstant(Opcode, DL, MVT::i32), PreOp);

	  // Reduce across the input vector, then read the result out of lane 0.
	  return DAG.getNode(
	      ISD::EXTRACT_VECTOR_ELT, DL, N->getValueType(0),
	      DAG.getNode(Opcode, DL, PreOp.getSimpleValueType(), PreOp),
	      DAG.getConstant(0, DL, MVT::i64));
	}

	/// Target-specific DAG combine for the across vector min/max reductions.
	/// It handles the final clean-up step of the vector min/max reductions
	/// produced by the LoopVectorizer: the log2-shuffle pattern that narrows the
	/// vector down to its min/max element, finished by a select between lanes 0
	/// and 1. For example, for a <16 x i8> vector :
	///   %svn0 = vector_shuffle %0, undef<8,9,10,11,12,13,14,15,u,u,u,u,u,u,u,u>
	///   %smax0 = smax %arr, %svn0
	///   %svn1 = vector_shuffle %smax0, undef<4,5,6,7,u,u,u,u,u,u,u,u,u,u,u,u>
	///   %smax1 = smax %smax0, %svn1
	///   %svn2 = vector_shuffle %smax1, undef<2,3,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
	///   %smax2 = smax %smax1, %svn2
	///   %svn3 = vector_shuffle %smax2, undef<1,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
	///   %sc = setcc %smax2, %svn3, gt
	///   %n0 = extract_vector_elt %sc, #0
	///   %n1 = extract_vector_elt %smax2, #0
	///   %n2 = extract_vector_elt %smax2, #1
	///   %result = select %n0, %n1, %n2
	/// becomes :
	///   %1 = smaxv %0
	///   %result = extract_vector_elt %1, 0
	static SDValue
	performAcrossLaneMinMaxReductionCombine(SDNode *N, SelectionDAG &DAG,
	                                        const AArch64Subtarget *Subtarget) {
	  if (!Subtarget->hasNEON())
	    return SDValue();

	  SDValue Cond = N->getOperand(0);
	  SDValue IfTrue = N->getOperand(1);
	  SDValue IfFalse = N->getOperand(2);

	  // All three select operands have to be vector element extracts.
	  const bool AllExtracts = Cond.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
	                           IfTrue.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
	                           IfFalse.getOpcode() == ISD::EXTRACT_VECTOR_ELT;
	  if (!AllExtracts)
	    return SDValue();

	  // The condition has to be extracted from a vector-of-i1 SETCC.
	  SDValue SetCC = Cond.getOperand(0);
	  EVT SetCCVT = SetCC.getValueType();
	  if (SetCC.getOpcode() != ISD::SETCC)
	    return SDValue();
	  if (!SetCCVT.isVector() || SetCCVT.getVectorElementType() != MVT::i1)
	    return SDValue();

	  // The compared vector has to be produced by a supported min/max operator.
	  SDValue VectorOp = SetCC.getOperand(0);
	  const unsigned Op = VectorOp->getOpcode();
	  bool IsFP;
	  switch (Op) {
	  case ISD::SMAX:
	  case ISD::UMAX:
	  case ISD::SMIN:
	  case ISD::UMIN:
	    IsFP = false;
	    break;
	  case ISD::FMAXNUM:
	  case ISD::FMINNUM:
	    IsFP = true;
	    break;
	  default:
	    return SDValue();
	  }

	  EVT VTy = VectorOp.getValueType();
	  if (!VTy.isVector())
	    return SDValue();

	  if (VTy.getSizeInBits() < 64)
	    return SDValue();

	  // Supported element types: f32 for FP, i8/i16/i32 for integers.
	  EVT EltTy = VTy.getVectorElementType();
	  const bool EltOK =
	      IsFP ? EltTy == MVT::f32
	           : (EltTy == MVT::i32 || EltTy == MVT::i16 || EltTy == MVT::i8);
	  if (!EltOK)
	    return SDValue();

	  // Both selected values have to be extracted from the compared vector.
	  // For example,
	  //   %sc = setcc %vector, %svn1, gt
	  //   %n0 = extract_vector_elt %sc, #0
	  //   %n1 = extract_vector_elt %vector, #0
	  //   %n2 = extract_vector_elt %vector, #1
	  if (VectorOp != IfTrue->getOperand(0) || VectorOp != IfFalse->getOperand(0))
	    return SDValue();

	  // The condition code has to agree with the operator.
	  const ISD::CondCode CC = cast<CondCodeSDNode>(SetCC->getOperand(2))->get();
	  bool CCMatches = true;
	  switch (Op) {
	  case ISD::SMAX:
	    CCMatches = (CC == ISD::SETGT || CC == ISD::SETGE);
	    break;
	  case ISD::UMAX:
	    CCMatches = (CC == ISD::SETUGT || CC == ISD::SETUGE);
	    break;
	  case ISD::SMIN:
	    CCMatches = (CC == ISD::SETLT || CC == ISD::SETLE);
	    break;
	  case ISD::UMIN:
	    CCMatches = (CC == ISD::SETULT || CC == ISD::SETULE);
	    break;
	  case ISD::FMAXNUM:
	    CCMatches = (CC == ISD::SETOGT || CC == ISD::SETOGE ||
	                 CC == ISD::SETUGT || CC == ISD::SETUGE ||
	                 CC == ISD::SETGT || CC == ISD::SETGE);
	    break;
	  case ISD::FMINNUM:
	    CCMatches = (CC == ISD::SETOLT || CC == ISD::SETOLE ||
	                 CC == ISD::SETULT || CC == ISD::SETULE ||
	                 CC == ISD::SETLT || CC == ISD::SETLE);
	    break;
	  default:
	    break;
	  }
	  if (!CCMatches)
	    return SDValue();

	  // Expect to check only lane 0 from the vector SETCC.
	  if (!isNullConstant(Cond.getOperand(1)))
	    return SDValue();

	  // Expect to extract the true value from lane 0.
	  if (!isNullConstant(IfTrue.getOperand(1)))
	    return SDValue();

	  // Expect to extract the false value from lane 1.
	  if (!isOneConstant(IfFalse.getOperand(1)))
	    return SDValue();

	  return tryMatchAcrossLaneShuffleForReduction(N, SetCC, Op, DAG);
	}

	/// Target-specific DAG combine for the across vector add reduction.
	/// It handles the final clean-up step of the vector add reduction produced
	/// by the LoopVectorizer: the log2-shuffle pattern that adds all elements of
	/// a vector together. For example, for a <4 x i32> vector :
	///   %1 = vector_shuffle %0, <2,3,u,u>
	///   %2 = add %0, %1
	///   %3 = vector_shuffle %2, <1,u,u,u>
	///   %4 = add %2, %3
	///   %result = extract_vector_elt %4, 0
	/// becomes :
	///   %0 = uaddv %0
	///   %result = extract_vector_elt %0, 0
	static SDValue
	performAcrossLaneAddReductionCombine(SDNode *N, SelectionDAG &DAG,
	                                     const AArch64Subtarget *Subtarget) {
	  if (!Subtarget->hasNEON())
	    return SDValue();

	  SDValue Vec = N->getOperand(0);
	  SDValue Lane = N->getOperand(1);

	  // The extracted vector has to be produced by an ADD.
	  if (Vec->getOpcode() != ISD::ADD)
	    return SDValue();

	  // The final result of the reduction is expected in lane 0, so the extract
	  // index has to be the constant zero.
	  if (!isNullConstant(Lane))
	    return SDValue();

	  EVT VTy = Vec.getValueType();
	  if (!VTy.isVector())
	    return SDValue();

	  // Only i8, i16 and i32 elements are handled.
	  EVT EltTy = VTy.getVectorElementType();
	  const bool EltOK =
	      EltTy == MVT::i32 || EltTy == MVT::i16 || EltTy == MVT::i8;
	  if (!EltOK)
	    return SDValue();

	  if (VTy.getSizeInBits() < 64)
	    return SDValue();

	  return tryMatchAcrossLaneShuffleForReduction(N, Vec, ISD::ADD, DAG);
	}

	/// Target-specific DAG combine function for NEON load/store intrinsics
	/// to merge base address updates.
	static SDValue performNEONPostLDSTCombine(SDNode *N,
	TargetLowering::DAGCombinerInfo &DCI,
	SelectionDAG &DAG) {
	if (DCI.isBeforeLegalize() \|\| DCI.isCalledByLegalizer())
	return SDValue();

	unsigned AddrOpIdx = N->getNumOperands() - 1;
	SDValue Addr = N->getOperand(AddrOpIdx);

	// Search for a use of the address operand that is an increment.
	for (SDNode::use_iterator UI = Addr.getNode()->use_begin(),
	UE = Addr.getNode()->use_end(); UI != UE; ++UI) {
	SDNode User = UI;
	if (User->getOpcode() != ISD::ADD \|\|
	UI.getUse().getResNo() != Addr.getResNo())
	continue;

	// Check that the add is independent of the load/store. Otherwise, folding
	// it would create a cycle.
	if (User->isPredecessorOf(N) \|\| N->isPredecessorOf(User))
	continue;

	// Find the new opcode for the updating load/store.
	bool IsStore = false;
	bool IsLaneOp = false;
	bool IsDupOp = false;
	unsigned NewOpc = 0;
	unsigned NumVecs = 0;
	unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
	switch (IntNo) {
	default: llvm_unreachable("unexpected intrinsic for Neon base update");
	case Intrinsic::aarch64_neon_ld2: NewOpc = AArch64ISD::LD2post;
	NumVecs = 2; break;
	case Intrinsic::aarch64_neon_ld3: NewOpc = AArch64ISD::LD3post;
	NumVecs = 3; break;
	case Intrinsic::aarch64_neon_ld4: NewOpc = AArch64ISD::LD4post;
	NumVecs = 4; break;
	case Intrinsic::aarch64_neon_st2: NewOpc = AArch64ISD::ST2post;
	NumVecs = 2; IsStore = true; break;
	case Intrinsic::aarch64_neon_st3: NewOpc = AArch64ISD::ST3post;
	NumVecs = 3; IsStore = true; break;
	case Intrinsic::aarch64_neon_st4: NewOpc = AArch64ISD::ST4post;
	NumVecs = 4; IsStore = true; break;
	case Intrinsic::aarch64_neon_ld1x2: NewOpc = AArch64ISD::LD1x2post;
	NumVecs = 2; break;
	case Intrinsic::aarch64_neon_ld1x3: NewOpc = AArch64ISD::LD1x3post;
	NumVecs = 3; break;
	case Intrinsic::aarch64_neon_ld1x4: NewOpc = AArch64ISD::LD1x4post;
	NumVecs = 4; break;
	case Intrinsic::aarch64_neon_st1x2: NewOpc = AArch64ISD::ST1x2post;
	NumVecs = 2; IsStore = true; break;
	case Intrinsic::aarch64_neon_st1x3: NewOpc = AArch64ISD::ST1x3post;
	NumVecs = 3; IsStore = true; break;
	case Intrinsic::aarch64_neon_st1x4: NewOpc = AArch64ISD::ST1x4post;
	NumVecs = 4; IsStore = true; break;
	case Intrinsic::aarch64_neon_ld2r: NewOpc = AArch64ISD::LD2DUPpost;
	NumVecs = 2; IsDupOp = true; break;
	case Intrinsic::aarch64_neon_ld3r: NewOpc = AArch64ISD::LD3DUPpost;
	NumVecs = 3; IsDupOp = true; break;
	case Intrinsic::aarch64_neon_ld4r: NewOpc = AArch64ISD::LD4DUPpost;
	NumVecs = 4; IsDupOp = true; break;
	case Intrinsic::aarch64_neon_ld2lane: NewOpc = AArch64ISD::LD2LANEpost;
	NumVecs = 2; IsLaneOp = true; break;
	case Intrinsic::aarch64_neon_ld3lane: NewOpc = AArch64ISD::LD3LANEpost;
	NumVecs = 3; IsLaneOp = true; break;
	case Intrinsic::aarch64_neon_ld4lane: NewOpc = AArch64ISD::LD4LANEpost;
	NumVecs = 4; IsLaneOp = true; break;
	case Intrinsic::aarch64_neon_st2lane: NewOpc = AArch64ISD::ST2LANEpost;
	NumVecs = 2; IsStore = true; IsLaneOp = true; break;
	case Intrinsic::aarch64_neon_st3lane: NewOpc = AArch64ISD::ST3LANEpost;
	NumVecs = 3; IsStore = true; IsLaneOp = true; break;
	case Intrinsic::aarch64_neon_st4lane: NewOpc = AArch64ISD::ST4LANEpost;
	NumVecs = 4; IsStore = true; IsLaneOp = true; break;
	}

	EVT VecTy;
	if (IsStore)
	VecTy = N->getOperand(2).getValueType();
	else
	VecTy = N->getValueType(0);

	// If the increment is a constant, it must match the memory ref size.
	SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0);
	if (ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode())) {
	uint32_t IncVal = CInc->getZExtValue();
	unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8;
	if (IsLaneOp \|\| IsDupOp)
	NumBytes /= VecTy.getVectorNumElements();
	if (IncVal != NumBytes)
	continue;
	Inc = DAG.getRegister(AArch64::XZR, MVT::i64);
	}
	SmallVector<SDValue, 8> Ops;
	Ops.push_back(N->getOperand(0)); // Incoming chain
	// Load lane and store have vector list as input.
	if (IsLaneOp \|\| IsStore)
	for (unsigned i = 2; i < AddrOpIdx; ++i)
	Ops.push_back(N->getOperand(i));
	Ops.push_back(Addr); // Base register
	Ops.push_back(Inc);

	// Return Types.
	EVT Tys[6];
	unsigned NumResultVecs = (IsStore ? 0 : NumVecs);
	unsigned n;
	for (n = 0; n < NumResultVecs; ++n)
	Tys[n] = VecTy;
	Tys[n++] = MVT::i64; // Type of write back register
	Tys[n] = MVT::Other; // Type of the chain
	SDVTList SDTys = DAG.getVTList(makeArrayRef(Tys, NumResultVecs + 2));

	MemIntrinsicSDNode *MemInt = cast<MemIntrinsicSDNode>(N);
	SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, SDLoc(N), SDTys, Ops,
	MemInt->getMemoryVT(),
	MemInt->getMemOperand());

	// Update the uses.
	std::vector<SDValue> NewResults;
	for (unsigned i = 0; i < NumResultVecs; ++i) {
	NewResults.push_back(SDValue(UpdN.getNode(), i));
	}
	NewResults.push_back(SDValue(UpdN.getNode(), NumResultVecs + 1));
	DCI.CombineTo(N, NewResults);
	DCI.CombineTo(User, SDValue(UpdN.getNode(), NumResultVecs));

	break;
	}
	return SDValue();
	}

	// Checks whether V is known to fit in `width` bits and reports how it was
	// extended.
	//
	// ExtType is always written:
	//   - ISD::LOAD of memory type i8 (width 8) or i16 (width 16): the load's own
	//     extension type.
	//   - ISD::AssertSext / ISD::AssertZext of i8 or i16 matching `width`:
	//     ISD::SEXTLOAD / ISD::ZEXTLOAD respectively.
	//   - everything else, including constants: ISD::NON_EXTLOAD.
	//
	// Returns true when the width requirement is met. Constants qualify when
	// their signed value lies strictly between -2^(width-1) and 2^(width-1).
	static bool checkValueWidth(SDValue V, unsigned width,
	                            ISD::LoadExtType &ExtType) {
	  // Only an i8 value with width 8, or an i16 value with width 16, matches.
	  auto MatchesWidth = [width](EVT VT) {
	    return (VT == MVT::i8 && width == 8) || (VT == MVT::i16 && width == 16);
	  };

	  ExtType = ISD::NON_EXTLOAD;
	  switch (V.getNode()->getOpcode()) {
	  default:
	    return false;
	  case ISD::LOAD: {
	    LoadSDNode *LoadNode = cast<LoadSDNode>(V.getNode());
	    if (!MatchesWidth(LoadNode->getMemoryVT()))
	      return false;
	    ExtType = LoadNode->getExtensionType();
	    return true;
	  }
	  case ISD::AssertSext: {
	    VTSDNode *TypeNode = cast<VTSDNode>(V.getNode()->getOperand(1));
	    if (!MatchesWidth(TypeNode->getVT()))
	      return false;
	    ExtType = ISD::SEXTLOAD;
	    return true;
	  }
	  case ISD::AssertZext: {
	    VTSDNode *TypeNode = cast<VTSDNode>(V.getNode()->getOperand(1));
	    if (!MatchesWidth(TypeNode->getVT()))
	      return false;
	    ExtType = ISD::ZEXTLOAD;
	    return true;
	  }
	  case ISD::Constant:
	  case ISD::TargetConstant: {
	    // Equivalent to |Val| < 2^(width-1), but written as a range check so that
	    // the most negative 64-bit value does not go through std::abs, whose
	    // result would not be representable.
	    const int64_t Val = cast<ConstantSDNode>(V.getNode())->getSExtValue();
	    const int64_t Limit = int64_t(1) << (width - 1);
	    return Val > -Limit && Val < Limit;
	  }
	  }
	}

	// This function does a whole lot of voodoo to determine if the tests are
	// equivalent without and with a mask. Essentially what happens is that given a
	// DAG resembling:
	//
	//  +-------------+ +-------------+ +-------------+ +-------------+
	//  |    Input    | | AddConstant | | CompConstant| |     CC      |
	//  +-------------+ +-------------+ +-------------+ +-------------+
	//         |               |               |               |
	//         V               V               |    +----------+
	//        +-----------------+  +----+      |    |
	//        |       ADD       |  |0xff|      |    |
	//        +-----------------+  +----+      |    |
	//                 |              |        |    |
	//                 V              V        |    |
	//                +-----------------+      |    |
	//                |       AND       |      |    |
	//                +-----------------+      |    |
	//                         |               |    |
	//                         +----------+    |    |
	//                                    |    |    |
	//                                    V    V    V
	//                                  +-------------+
	//                                  |     CMP     |
	//                                  +-------------+
	//
	// The AND node may be safely removed for some combinations of inputs. In
	// particular we need to take into account the extension type of the Input,
	// the exact values of AddConstant, CompConstant, and CC, along with the nominal
	// width of the input (this can work for any width inputs, the above graph is
	// specific to 8 bits).
	//
	// The specific equations were worked out by generating output tables for each
	// AArch64CC value in terms of AddConstant (w1) and CompConstant (w2). The
	// problem was simplified by working with 4 bit inputs, which means we only
	// needed to reason about 24 distinct bit patterns: 8 patterns unique to zero
	// extension (8,15), 8 patterns unique to sign extensions (-8,-1), and 8
	// patterns present in both extensions (0,7). For every distinct set of
	// AddConstant and CompConstant bit patterns we can consider the masked and
	// unmasked versions to be equivalent if the result of this function is true for
	// all 16 distinct bit patterns for the current extension type of Input (w0).
	//
	// sub w8, w0, w1
	// and w10, w8, #0x0f
	// cmp w8, w2
	// cset w9, AArch64CC
	// cmp w10, w2
	// cset w11, AArch64CC
	// cmp w9, w11
	// cset w0, eq
	// ret
	//
	// Since the above function shows when the outputs are equivalent it defines
	// when it is safe to remove the AND. Unfortunately it only runs on AArch64 and
	// would be expensive to run during compiles. The equations below were written
	// in a test harness that confirmed they gave equivalent outputs to the above
	// function for all inputs, so they can be used to determine if the removal is
	// legal instead.
	//
	// isEquivalentMaskless() is the code for testing if the AND can be removed
	// factored out of the DAG recognition as the DAG can take several forms.

	// Decides whether comparing the unmasked ADD result against CompConstant
	// under condition CC gives the same outcome as comparing the masked result.
	//
	//   CC           - AArch64CC condition code consumed by the user of the
	//                  compare.
	//   width        - number of low bits kept by the mask (the caller in this
	//                  file passes 8 or 16).
	//   ExtType      - extension kind of the non-constant ADD input.
	//   AddConstant  - constant operand of the ADD.
	//   CompConstant - constant the result is compared against.
	//
	// Returns true when the AND may be dropped, false otherwise (including for
	// AArch64CC::Invalid and any unlisted code).
	static bool isEquivalentMaskless(unsigned CC, unsigned width,
	                                 ISD::LoadExtType ExtType, int AddConstant,
	                                 int CompConstant) {
	  // By being careful about our equations and only writing them in terms of
	  // symbolic values and well known constants (0, 1, -1, MaxUInt) we can
	  // make them generally applicable to all bit widths.
	  // Note: despite its name, MaxUInt holds 2^width.
	  int MaxUInt = (1 << width);

	  // For the purposes of these comparisons sign extending the type is
	  // equivalent to zero extending the add and displacing it by half the integer
	  // width. Provided we are careful and make sure our equations are valid over
	  // the whole range we can just adjust the input and avoid writing equations
	  // for sign extended inputs.
	  if (ExtType == ISD::SEXTLOAD)
	    AddConstant -= (1 << (width - 1));

	  switch (CC) {
	  case AArch64CC::LE:
	  case AArch64CC::GT:
	    if ((AddConstant == 0) ||
	        (CompConstant == MaxUInt - 1 && AddConstant < 0) ||
	        (AddConstant >= 0 && CompConstant < 0) ||
	        (AddConstant <= 0 && CompConstant <= 0 && CompConstant < AddConstant))
	      return true;
	    break;
	  case AArch64CC::LT:
	  case AArch64CC::GE:
	    if ((AddConstant == 0) ||
	        (AddConstant >= 0 && CompConstant <= 0) ||
	        (AddConstant <= 0 && CompConstant <= 0 && CompConstant <= AddConstant))
	      return true;
	    break;
	  case AArch64CC::HI:
	  case AArch64CC::LS:
	    if ((AddConstant >= 0 && CompConstant < 0) ||
	        (AddConstant <= 0 && CompConstant >= -1 &&
	         CompConstant < AddConstant + MaxUInt))
	      return true;
	    break;
	  case AArch64CC::PL:
	  case AArch64CC::MI:
	    if ((AddConstant == 0) ||
	        (AddConstant > 0 && CompConstant <= 0) ||
	        (AddConstant < 0 && CompConstant <= AddConstant))
	      return true;
	    break;
	  case AArch64CC::LO:
	  case AArch64CC::HS:
	    if ((AddConstant >= 0 && CompConstant <= 0) ||
	        (AddConstant <= 0 && CompConstant >= 0 &&
	         CompConstant <= AddConstant + MaxUInt))
	      return true;
	    break;
	  case AArch64CC::EQ:
	  case AArch64CC::NE:
	    if ((AddConstant > 0 && CompConstant < 0) ||
	        (AddConstant < 0 && CompConstant >= 0 &&
	         CompConstant < AddConstant + MaxUInt) ||
	        (AddConstant >= 0 && CompConstant >= 0 &&
	         CompConstant >= AddConstant) ||
	        (AddConstant <= 0 && CompConstant < 0 && CompConstant < AddConstant))
	      return true;
	    break;
	  case AArch64CC::VS:
	  case AArch64CC::VC:
	  case AArch64CC::AL:
	  case AArch64CC::NV:
	    // These codes are treated as unaffected by the mask.
	    return true;
	  case AArch64CC::Invalid:
	    break;
	  }

	  return false;
	}

	// Tries to drop a redundant 0xff / 0xffff mask in front of the SUBS whose
	// result is operand CmpIndex of N.
	//
	// The pattern matched is
	//   (SUBS (AND (ADD x, AddC), Mask), CompC)
	// where Mask is exactly 255 or 65535, AddC and CompC are constants, and all
	// three of x, AddC and CompC pass checkValueWidth for the mask width. When
	// isEquivalentMaskless agrees for the condition code found at operand CCIndex,
	// every use of the SUBS is redirected to a new SUBS that reads the ADD
	// directly.
	//
	// Returns SDValue(N, 0) when the DAG was changed and an empty SDValue
	// otherwise. DCI is accepted for signature uniformity but not used.
	static SDValue performCONDCombine(SDNode *N,
	                                  TargetLowering::DAGCombinerInfo &DCI,
	                                  SelectionDAG &DAG, unsigned CCIndex,
	                                  unsigned CmpIndex) {
	  unsigned CC = cast<ConstantSDNode>(N->getOperand(CCIndex))->getSExtValue();
	  SDNode *SubsNode = N->getOperand(CmpIndex).getNode();
	  unsigned CondOpcode = SubsNode->getOpcode();

	  if (CondOpcode != AArch64ISD::SUBS)
	    return SDValue();

	  // There is a SUBS feeding this condition. Is it fed by a mask we can use?
	  SDNode *AndNode = SubsNode->getOperand(0).getNode();
	  if (AndNode->getOpcode() != ISD::AND)
	    return SDValue();

	  unsigned MaskBits = 0;
	  if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(AndNode->getOperand(1))) {
	    // Keep the full 64-bit value: narrowing to 32 bits first would let a wide
	    // mask such as 0x1000000FF be mistaken for 0xFF.
	    const uint64_t CNV = CN->getZExtValue();
	    if (CNV == 255)
	      MaskBits = 8;
	    else if (CNV == 65535)
	      MaskBits = 16;
	  }

	  if (!MaskBits)
	    return SDValue();

	  SDValue AddValue = AndNode->getOperand(0);
	  if (AddValue.getOpcode() != ISD::ADD)
	    return SDValue();

	  // The basic dag structure is correct, grab the inputs and validate them.
	  SDValue AddInputValue1 = AddValue.getNode()->getOperand(0);
	  SDValue AddInputValue2 = AddValue.getNode()->getOperand(1);
	  SDValue SubsInputValue = SubsNode->getOperand(1);

	  // Both the ADD's second operand and the SUBS's second operand must be
	  // constants.
	  auto *AddC = dyn_cast<ConstantSDNode>(AddInputValue2.getNode());
	  auto *CompC = dyn_cast<ConstantSDNode>(SubsInputValue.getNode());
	  if (!AddC || !CompC)
	    return SDValue();

	  // Every value must be known to fit in the mask width. The checks run in
	  // this order so that ExtType ends up describing AddInputValue1.
	  ISD::LoadExtType ExtType;
	  if (!checkValueWidth(SubsInputValue, MaskBits, ExtType) ||
	      !checkValueWidth(AddInputValue2, MaskBits, ExtType) ||
	      !checkValueWidth(AddInputValue1, MaskBits, ExtType))
	    return SDValue();

	  if (!isEquivalentMaskless(CC, MaskBits, ExtType, AddC->getSExtValue(),
	                            CompC->getSExtValue()))
	    return SDValue();

	  // The AND is not necessary, remove it.
	  SDVTList VTs = DAG.getVTList(SubsNode->getValueType(0),
	                               SubsNode->getValueType(1));
	  SDValue Ops[] = { AddValue, SubsNode->getOperand(1) };

	  SDValue NewValue = DAG.getNode(CondOpcode, SDLoc(SubsNode), VTs, Ops);
	  DAG.ReplaceAllUsesWith(SubsNode, NewValue.getNode());

	  return SDValue(N, 0);
	}

	// Optimize compare with zero and branch.
	//
	// First gives performCONDCombine a chance to strip a redundant mask. Then, for
	// an EQ/NE branch on the flags of an ADDS/SUBS whose value result is unused
	// and whose flag result has exactly one use, folds a comparison of an i32/i64
	// value against zero into CBZ (EQ) or CBNZ (NE). Nothing is folded when the
	// non-zero operand is a SHL, SRA or SRL node.
	//
	// Always returns an empty SDValue; a successful fold is reported to the
	// combiner through DCI.CombineTo.
	static SDValue performBRCONDCombine(SDNode *N,
	                                    TargetLowering::DAGCombinerInfo &DCI,
	                                    SelectionDAG &DAG) {
	  if (SDValue NV = performCONDCombine(N, DCI, DAG, 2, 3))
	    N = NV.getNode();

	  SDValue Chain = N->getOperand(0);
	  SDValue Dest = N->getOperand(1);
	  SDValue CCVal = N->getOperand(2);
	  SDValue Cmp = N->getOperand(3);

	  assert(isa<ConstantSDNode>(CCVal) && "Expected a ConstantSDNode here!");
	  unsigned CC = cast<ConstantSDNode>(CCVal)->getZExtValue();
	  if (CC != AArch64CC::EQ && CC != AArch64CC::NE)
	    return SDValue();

	  unsigned CmpOpc = Cmp.getOpcode();
	  if (CmpOpc != AArch64ISD::ADDS && CmpOpc != AArch64ISD::SUBS)
	    return SDValue();

	  // Only attempt folding if there is only one use of the flag and no use of the
	  // value.
	  if (!Cmp->hasNUsesOfValue(0, 0) || !Cmp->hasNUsesOfValue(1, 1))
	    return SDValue();

	  SDValue LHS = Cmp.getOperand(0);
	  SDValue RHS = Cmp.getOperand(1);

	  assert(LHS.getValueType() == RHS.getValueType() &&
	         "Expected the value type to be the same for both operands!");
	  if (LHS.getValueType() != MVT::i32 && LHS.getValueType() != MVT::i64)
	    return SDValue();

	  // Canonicalise so that a zero operand, if any, is on the right.
	  if (isNullConstant(LHS))
	    std::swap(LHS, RHS);

	  if (!isNullConstant(RHS))
	    return SDValue();

	  if (LHS.getOpcode() == ISD::SHL || LHS.getOpcode() == ISD::SRA ||
	      LHS.getOpcode() == ISD::SRL)
	    return SDValue();

	  // Fold the compare into the branch instruction.
	  const unsigned BrOpc =
	      (CC == AArch64CC::EQ) ? AArch64ISD::CBZ : AArch64ISD::CBNZ;
	  SDValue BR = DAG.getNode(BrOpc, SDLoc(N), MVT::Other, Chain, LHS, Dest);

	  // Do not add new nodes to DAG combiner worklist.
	  DCI.CombineTo(N, BR, false);

	  return SDValue();
	}

	// Looks through simple single-use operations feeding a tbz/tbnz so that the
	// test can be applied to an earlier value instead. Bit is updated to the bit
	// that must be tested on the returned value and Invert is flipped whenever the
	// sense of the test changes. This is done here rather than by the generic DAG
	// combines because AArch64ISD::TBZ is matched during legalization.
	//
	// Undef and constant-foldable inputs (and of 0, tests of shifted-out bits,
	// ...) are not handled; they are expected to have been simplified already.
	//
	// Returns Op itself when nothing can be peeled off.
	static SDValue getTestBitOperand(SDValue Op, unsigned &Bit, bool &Invert,
	                                 SelectionDAG &DAG) {
	  // Only look through nodes whose sole user is this test.
	  if (!Op->hasOneUse())
	    return Op;

	  const unsigned Opc = Op->getOpcode();

	  // (tbz (trunc x), b) -> (tbz x, b)
	  // This case is just here to enable more of the below cases to be caught.
	  if (Opc == ISD::TRUNCATE && Bit < Op->getValueType(0).getSizeInBits())
	    return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);

	  // Everything below is a binary operation with a constant right operand.
	  if (Op->getNumOperands() != 2)
	    return Op;

	  auto *C = dyn_cast<ConstantSDNode>(Op->getOperand(1));
	  if (!C)
	    return Op;

	  SDValue Src = Op->getOperand(0);

	  // (tbz (and x, m), b) -> (tbz x, b), provided m keeps bit b.
	  if (Opc == ISD::AND) {
	    if ((C->getZExtValue() >> Bit) & 1)
	      return getTestBitOperand(Src, Bit, Invert, DAG);
	    return Op;
	  }

	  // (tbz (shl x, c), b) -> (tbz x, b-c)
	  if (Opc == ISD::SHL) {
	    const uint64_t ShAmt = C->getZExtValue();
	    if (ShAmt > Bit || (Bit - ShAmt) >= Op->getValueType(0).getSizeInBits())
	      return Op;
	    Bit = Bit - ShAmt;
	    return getTestBitOperand(Src, Bit, Invert, DAG);
	  }

	  // (tbz (sra x, c), b) -> (tbz x, b+c) or (tbz x, msb) if b+c is > # bits in x
	  if (Opc == ISD::SRA) {
	    const auto NumBits = Op->getValueType(0).getSizeInBits();
	    Bit = Bit + C->getZExtValue();
	    if (Bit >= NumBits)
	      Bit = NumBits - 1;
	    return getTestBitOperand(Src, Bit, Invert, DAG);
	  }

	  // (tbz (srl x, c), b) -> (tbz x, b+c)
	  if (Opc == ISD::SRL) {
	    const uint64_t SrcBit = Bit + C->getZExtValue();
	    if (SrcBit >= Op->getValueType(0).getSizeInBits())
	      return Op;
	    Bit = SrcBit;
	    return getTestBitOperand(Src, Bit, Invert, DAG);
	  }

	  // (tbz (xor x, -1), b) -> (tbnz x, b)
	  if (Opc == ISD::XOR) {
	    if ((C->getZExtValue() >> Bit) & 1)
	      Invert = !Invert;
	    return getTestBitOperand(Src, Bit, Invert, DAG);
	  }

	  return Op;
	}

	// Optimize test single bit zero/non-zero and branch.
	//
	// Asks getTestBitOperand for a simpler value/bit pair to test. When one is
	// found, builds a replacement TBZ/TBNZ (swapping the two opcodes if the test
	// sense was inverted) on the new source and bit; otherwise returns an empty
	// SDValue. DCI is not used.
	static SDValue performTBZCombine(SDNode *N,
	                                 TargetLowering::DAGCombinerInfo &DCI,
	                                 SelectionDAG &DAG) {
	  SDValue OldSrc = N->getOperand(1);
	  unsigned Bit = cast<ConstantSDNode>(N->getOperand(2))->getZExtValue();
	  bool Invert = false;

	  SDValue NewSrc = getTestBitOperand(OldSrc, Bit, Invert, DAG);
	  if (OldSrc == NewSrc)
	    return SDValue(); // Nothing could be looked through.

	  unsigned NewOpc = N->getOpcode();
	  if (Invert) {
	    if (NewOpc == AArch64ISD::TBZ) {
	      NewOpc = AArch64ISD::TBNZ;
	    } else {
	      assert(NewOpc == AArch64ISD::TBNZ);
	      NewOpc = AArch64ISD::TBZ;
	    }
	  }

	  SDLoc DL(N);
	  SDValue Chain = N->getOperand(0);
	  SDValue Dest = N->getOperand(3);
	  SDValue BitNo = DAG.getConstant(Bit, DL, MVT::i64);
	  return DAG.getNode(NewOpc, DL, MVT::Other, Chain, NewSrc, BitNo, Dest);
	}

	// vselect (v1i1 setcc) ->
	// vselect (v1iXX setcc) (XX is the size of the compared operand type)
	// FIXME: Currently the type legalizer can't handle VSELECT having v1i1 as
	// condition. If it can legalize "VSELECT v1i1" correctly, no need to combine
	// such VSELECT.
	//
	// Applies only when the condition is a single-element i1 SETCC and the select
	// result has the same bit size as the compared operands. Returns the rebuilt
	// VSELECT, or an empty SDValue when the pattern does not match.
	static SDValue performVSelectCombine(SDNode *N, SelectionDAG &DAG) {
	  SDValue N0 = N->getOperand(0);
	  EVT CCVT = N0.getValueType();

	  if (N0.getOpcode() != ISD::SETCC || CCVT.getVectorNumElements() != 1 ||
	      CCVT.getVectorElementType() != MVT::i1)
	    return SDValue();

	  EVT ResVT = N->getValueType(0);
	  EVT CmpVT = N0.getOperand(0).getValueType();
	  // Only combine when the result type is of the same size as the compared
	  // operands.
	  if (ResVT.getSizeInBits() != CmpVT.getSizeInBits())
	    return SDValue();

	  SDValue IfTrue = N->getOperand(1);
	  SDValue IfFalse = N->getOperand(2);
	  // Re-issue the compare with an integer result vector shaped like its
	  // operands.
	  SDValue SetCC =
	      DAG.getSetCC(SDLoc(N), CmpVT.changeVectorElementTypeToInteger(),
	                   N0.getOperand(0), N0.getOperand(1),
	                   cast<CondCodeSDNode>(N0.getOperand(2))->get());
	  return DAG.getNode(ISD::VSELECT, SDLoc(N), ResVT, SetCC,
	                     IfTrue, IfFalse);
	}

	/// A vector select: "(select (setcc LHS, RHS), vL, vR)" is best performed with
	/// the compare-mask instructions rather than going via NZCV, even if LHS and
	/// RHS are really scalar. This replaces any scalar setcc in the above pattern
	/// with a vector one followed by a DUP shuffle on the result.
	///
	/// Returns the replacement select, or an empty SDValue when the condition is
	/// not a SETCC, compares i1 values, the result is not a vector, or no vector
	/// compare type of matching size exists.
	static SDValue performSelectCombine(SDNode *N,
	                                    TargetLowering::DAGCombinerInfo &DCI) {
	  SelectionDAG &DAG = DCI.DAG;
	  SDValue N0 = N->getOperand(0);
	  EVT ResVT = N->getValueType(0);

	  if (N0.getOpcode() != ISD::SETCC)
	    return SDValue();

	  // Make sure the SETCC result is either i1 (initial DAG), or i32, the lowered
	  // scalar SetCCResultType. We also don't expect vectors, because we assume
	  // that selects fed by vector SETCCs are canonicalized to VSELECT.
	  assert((N0.getValueType() == MVT::i1 || N0.getValueType() == MVT::i32) &&
	         "Scalar-SETCC feeding SELECT has unexpected result type!");

	  EVT SrcVT = N0.getOperand(0).getValueType();

	  // Don't try to do this optimization when the setcc itself has i1 operands.
	  // There are no legal vectors of i1, so this would be pointless.
	  if (SrcVT == MVT::i1)
	    return SDValue();

	  // If NumMaskElts == 0, the comparison is larger than select result. The
	  // largest real NEON comparison is 64-bits per lane, which means the result is
	  // at most 32-bits and an illegal vector. Just bail out for now.
	  int NumMaskElts = ResVT.getSizeInBits() / SrcVT.getSizeInBits();
	  if (!ResVT.isVector() || NumMaskElts == 0)
	    return SDValue();

	  SrcVT = EVT::getVectorVT(*DAG.getContext(), SrcVT, NumMaskElts);
	  EVT CCVT = SrcVT.changeVectorElementTypeToInteger();

	  // Also bail out if the vector CCVT isn't the same size as ResVT.
	  // This can happen if the SETCC operand size doesn't divide the ResVT size
	  // (e.g., f64 vs v3f32).
	  if (CCVT.getSizeInBits() != ResVT.getSizeInBits())
	    return SDValue();

	  // Make sure we didn't create illegal types, if we're not supposed to.
	  assert(DCI.isBeforeLegalize() ||
	         DAG.getTargetLoweringInfo().isTypeLegal(SrcVT));

	  // First perform a vector comparison, where lane 0 is the one we're interested
	  // in.
	  SDLoc DL(N0);
	  SDValue LHS =
	      DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, SrcVT, N0.getOperand(0));
	  SDValue RHS =
	      DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, SrcVT, N0.getOperand(1));
	  SDValue SetCC = DAG.getNode(ISD::SETCC, DL, CCVT, LHS, RHS, N0.getOperand(2));

	  // Now duplicate the comparison mask we want across all other lanes.
	  SmallVector<int, 8> DUPMask(CCVT.getVectorNumElements(), 0);
	  SDValue Mask = DAG.getVectorShuffle(CCVT, DL, SetCC, SetCC, DUPMask);
	  Mask = DAG.getNode(ISD::BITCAST, DL,
	                     ResVT.changeVectorElementTypeToInteger(), Mask);

	  return DAG.getSelect(DL, ResVT, Mask, N->getOperand(1), N->getOperand(2));
	}

	/// Folds away an NVCAST whose result type equals its operand type by returning
	/// the operand itself; any other NVCAST is left alone (empty SDValue).
	static SDValue performNVCASTCombine(SDNode *N) {
	  SDValue Src = N->getOperand(0);
	  const bool IsNoOp = (N->getValueType(0) == Src.getValueType());
	  return IsNoOp ? Src : SDValue();
	}

	// Target hook for the DAG combiner: routes N to the AArch64-specific combine
	// that handles its opcode. An empty SDValue is returned for opcodes (and
	// intrinsics) that have no dedicated combine here.
	SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
	                                                 DAGCombinerInfo &DCI) const {
	  SelectionDAG &DAG = DCI.DAG;
	  switch (N->getOpcode()) {
	  case ISD::ADD:
	  case ISD::SUB:
	    return performAddSubLongCombine(N, DCI, DAG);
	  case ISD::XOR:
	    return performXorCombine(N, DAG, DCI, Subtarget);
	  case ISD::MUL:
	    return performMulCombine(N, DAG, DCI, Subtarget);
	  case ISD::SINT_TO_FP:
	  case ISD::UINT_TO_FP:
	    return performIntToFpCombine(N, DAG, Subtarget);
	  case ISD::FP_TO_SINT:
	  case ISD::FP_TO_UINT:
	    return performFpToIntCombine(N, DAG, DCI, Subtarget);
	  case ISD::FDIV:
	    return performFDivCombine(N, DAG, DCI, Subtarget);
	  case ISD::OR:
	    return performORCombine(N, DCI, Subtarget);
	  case ISD::SRL:
	    return performSRLCombine(N, DCI);
	  case ISD::INTRINSIC_WO_CHAIN:
	    return performIntrinsicCombine(N, DCI, Subtarget);
	  case ISD::ANY_EXTEND:
	  case ISD::ZERO_EXTEND:
	  case ISD::SIGN_EXTEND:
	    return performExtendCombine(N, DCI, DAG);
	  case ISD::BITCAST:
	    return performBitcastCombine(N, DCI, DAG);
	  case ISD::CONCAT_VECTORS:
	    return performConcatVectorsCombine(N, DCI, DAG);
	  case ISD::SELECT:
	    // Prefer the select combine; fall back to the across-lane min/max
	    // reduction combine only if it produced nothing.
	    if (SDValue RV = performSelectCombine(N, DCI))
	      return RV;
	    return performAcrossLaneMinMaxReductionCombine(N, DAG, Subtarget);
	  case ISD::VSELECT:
	    return performVSelectCombine(N, DAG);
	  case ISD::LOAD:
	    // Operand 1 of the load is handed to the TBI simplification.
	    if (performTBISimplification(N->getOperand(1), DCI, DAG))
	      return SDValue(N, 0);
	    return SDValue();
	  case ISD::STORE:
	    return performSTORECombine(N, DCI, DAG, Subtarget);
	  case AArch64ISD::BRCOND:
	    return performBRCONDCombine(N, DCI, DAG);
	  case AArch64ISD::TBNZ:
	  case AArch64ISD::TBZ:
	    return performTBZCombine(N, DCI, DAG);
	  case AArch64ISD::CSEL:
	    return performCONDCombine(N, DCI, DAG, 2, 3);
	  case AArch64ISD::DUP:
	    return performPostLD1Combine(N, DCI, false);
	  case AArch64ISD::NVCAST:
	    return performNVCASTCombine(N);
	  case ISD::INSERT_VECTOR_ELT:
	    return performPostLD1Combine(N, DCI, true);
	  case ISD::EXTRACT_VECTOR_ELT:
	    return performAcrossLaneAddReductionCombine(N, DAG, Subtarget);
	  case ISD::INTRINSIC_VOID:
	  case ISD::INTRINSIC_W_CHAIN: {
	    // Only the NEON structured load/store intrinsics are combined.
	    const auto IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
	    switch (IntNo) {
	    case Intrinsic::aarch64_neon_ld2:
	    case Intrinsic::aarch64_neon_ld3:
	    case Intrinsic::aarch64_neon_ld4:
	    case Intrinsic::aarch64_neon_ld1x2:
	    case Intrinsic::aarch64_neon_ld1x3:
	    case Intrinsic::aarch64_neon_ld1x4:
	    case Intrinsic::aarch64_neon_ld2lane:
	    case Intrinsic::aarch64_neon_ld3lane:
	    case Intrinsic::aarch64_neon_ld4lane:
	    case Intrinsic::aarch64_neon_ld2r:
	    case Intrinsic::aarch64_neon_ld3r:
	    case Intrinsic::aarch64_neon_ld4r:
	    case Intrinsic::aarch64_neon_st2:
	    case Intrinsic::aarch64_neon_st3:
	    case Intrinsic::aarch64_neon_st4:
	    case Intrinsic::aarch64_neon_st1x2:
	    case Intrinsic::aarch64_neon_st1x3:
	    case Intrinsic::aarch64_neon_st1x4:
	    case Intrinsic::aarch64_neon_st2lane:
	    case Intrinsic::aarch64_neon_st3lane:
	    case Intrinsic::aarch64_neon_st4lane:
	      return performNEONPostLDSTCombine(N, DCI, DAG);
	    default:
	      return SDValue();
	    }
	  }
	  default:
	    return SDValue();
	  }
	}

	// Check if the return value is used as only a return value, as otherwise
	// we can't perform a tail-call. In particular, we need to check for
	// target ISD nodes that are returns and any other "odd" constructs
	// that the generic analysis code won't necessarily catch.
	//
	// N must produce exactly one value with exactly one use, and that user must
	// be either a CopyToReg without a trailing glue operand or an FP_EXTEND,
	// whose users in turn are all AArch64ISD::RET_FLAG nodes (at least one). On
	// success Chain is updated (to the CopyToReg's incoming chain when there is
	// one) and true is returned; Chain is left untouched on failure.
	bool AArch64TargetLowering::isUsedByReturnOnly(SDNode *N,
	                                               SDValue &Chain) const {
	  if (N->getNumValues() != 1)
	    return false;
	  if (!N->hasNUsesOfValue(1, 0))
	    return false;

	  SDValue TCChain = Chain;
	  // The single user of N; use_begin() yields an iterator that has to be
	  // dereferenced to get at the node.
	  SDNode *Copy = *N->use_begin();
	  if (Copy->getOpcode() == ISD::CopyToReg) {
	    // If the copy has a glue operand, we conservatively assume it isn't safe to
	    // perform a tail call.
	    if (Copy->getOperand(Copy->getNumOperands() - 1).getValueType() ==
	        MVT::Glue)
	      return false;
	    TCChain = Copy->getOperand(0);
	  } else if (Copy->getOpcode() != ISD::FP_EXTEND)
	    return false;

	  // Every user of the copy must be a return, and there must be at least one.
	  bool HasRet = false;
	  for (SDNode *Node : Copy->uses()) {
	    if (Node->getOpcode() != AArch64ISD::RET_FLAG)
	      return false;
	    HasRet = true;
	  }

	  if (!HasRet)
	    return false;

	  Chain = TCChain;
	  return true;
	}

	// Return whether an instruction can potentially be optimized to a tail
	// call. This will cause the optimizers to attempt to move, or duplicate,
	// return instructions to help enable tail call optimizations for this
	// instruction. The answer is simply the call's own tail-call marker.
	bool AArch64TargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const {
	  const bool MarkedTail = CI->isTailCall();
	  return MarkedTail;
	}

	// Splits an ADD/SUB address computation into the pieces needed for indexed
	// addressing.
	//
	// Base receives operand 0 as soon as Op is known to be an ADD or SUB. Offset
	// and IsInc are only written on success, i.e. when operand 1 is a constant
	// whose (negated, for SUB) value fits a signed 9-bit immediate. IsInc is true
	// for ADD and false for SUB. AM and DAG are not touched here.
	bool AArch64TargetLowering::getIndexedAddressParts(SDNode *Op, SDValue &Base,
	                                                   SDValue &Offset,
	                                                   ISD::MemIndexedMode &AM,
	                                                   bool &IsInc,
	                                                   SelectionDAG &DAG) const {
	  const unsigned Opc = Op->getOpcode();
	  const bool IsAdd = (Opc == ISD::ADD);
	  if (!IsAdd && Opc != ISD::SUB)
	    return false;

	  Base = Op->getOperand(0);

	  ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Op->getOperand(1));
	  if (!RHS)
	    return false;

	  // All of the indexed addressing mode instructions take a signed
	  // 9 bit immediate offset.
	  int64_t RHSC = RHS->getSExtValue();
	  if (!IsAdd)
	    RHSC = -(uint64_t)RHSC; // negate via unsigned arithmetic
	  if (!isInt<9>(RHSC))
	    return false;

	  IsInc = IsAdd;
	  Offset = Op->getOperand(1);
	  return true;
	}

	// Reports whether the load or store N can use pre-indexed addressing.
	//
	// The node's base pointer is handed to getIndexedAddressParts; when that
	// succeeds, Base and Offset describe the address and AM is set to PRE_INC or
	// PRE_DEC according to the direction it reported. Returns false for any node
	// that is neither a LoadSDNode nor a StoreSDNode.
	bool AArch64TargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base,
	                                                      SDValue &Offset,
	                                                      ISD::MemIndexedMode &AM,
	                                                      SelectionDAG &DAG) const {
	  SDValue Ptr;
	  if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N))
	    Ptr = LD->getBasePtr();
	  else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N))
	    Ptr = ST->getBasePtr();
	  else
	    return false;

	  bool IsInc;
	  if (!getIndexedAddressParts(Ptr.getNode(), Base, Offset, AM, IsInc, DAG))
	    return false;
	  AM = IsInc ? ISD::PRE_INC : ISD::PRE_DEC;
	  return true;
	}

	bool AArch64TargetLowering::getPostIndexedAddressParts(
	SDNode N, SDNode Op, SDValue &Base, SDValue &Offset,
	ISD::MemIndexedMode &AM, SelectionDAG &DAG) const {
	EVT VT;
	SDValue Ptr;
	if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
	VT = LD->getMemoryVT();
	Ptr = LD->getBasePtr();
	} else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
	VT = ST->getMemoryVT();
	Ptr = ST->getBasePtr();
	} else
	return false;

	bool IsInc;
	if (!getIndexedAddressParts(Op, Base, Offset, AM, IsInc, DAG))
	return false;
	// Post-indexing updates the base, so it's not a valid transform
	// if that's not the same as the load's pointer.
	if (Ptr != Base)
	return false;
	AM = IsInc ? ISD::POST_INC : ISD::POST_DEC;
	return true;
	}

	// Custom result replacement for a BITCAST from f16 to i16; any other type
	// pair leaves Results untouched.
	//
	// The f16 operand is inserted into the hsub sub-register of an undefined
	// value to form an f32, that f32 is bitcast to i32, and the i32 is truncated
	// to i16. The truncate is appended to Results.
	static void ReplaceBITCASTResults(SDNode *N, SmallVectorImpl<SDValue> &Results,
	                                  SelectionDAG &DAG) {
	  SDLoc DL(N);
	  SDValue Op = N->getOperand(0);

	  if (N->getValueType(0) != MVT::i16 || Op.getValueType() != MVT::f16)
	    return;

	  Op = SDValue(
	      DAG.getMachineNode(TargetOpcode::INSERT_SUBREG, DL, MVT::f32,
	                         DAG.getUNDEF(MVT::i32), Op,
	                         DAG.getTargetConstant(AArch64::hsub, DL, MVT::i32)),
	      0);
	  Op = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Op);
	  Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Op));
	}

	// Custom result replacement for an across-vector reduction node.
	//
	// Operand 0 of N is split into low and high halves, the halves are combined
	// with InterOp, and AcrossOp is then applied to that half-width value. The
	// final node (of the low-half destination type) is appended to Results.
	static void ReplaceReductionResults(SDNode *N,
	                                    SmallVectorImpl<SDValue> &Results,
	                                    SelectionDAG &DAG, unsigned InterOp,
	                                    unsigned AcrossOp) {
	  SDLoc dl(N);

	  // Result types for the two halves; only the low one is used below.
	  EVT LoVT, HiVT;
	  std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));

	  SDValue Lo, Hi;
	  std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);

	  SDValue Combined = DAG.getNode(InterOp, dl, LoVT, Lo, Hi);
	  Results.push_back(DAG.getNode(AcrossOp, dl, LoVT, Combined));
	}

	// Splits an i128 value into two i64 values, returned as {low, high}: the low
	// half is a truncate of N, the high half a truncate of N logically shifted
	// right by 64.
	static std::pair<SDValue, SDValue> splitInt128(SDValue N, SelectionDAG &DAG) {
	  SDLoc DL(N);
	  SDValue Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::i64, N);
	  SDValue ShiftAmt = DAG.getConstant(64, DL, MVT::i64);
	  SDValue Shifted = DAG.getNode(ISD::SRL, DL, MVT::i128, N, ShiftAmt);
	  SDValue Hi = DAG.getNode(ISD::TRUNCATE, DL, MVT::i64, Shifted);
	  return std::make_pair(Lo, Hi);
	}

	// Custom result replacement for a 128-bit ATOMIC_CMP_SWAP.
	//
	// Operands 2 and 3 of N are each split into two i64 halves and fed, together
	// with operand 1 and the chain (operand 0), to a CMP_SWAP_128 machine node
	// that carries N's memory operand. The machine node yields
	// (i64, i64, i32, chain); results 0, 1 and 3 are appended to Results.
	static void ReplaceCMP_SWAP_128Results(SDNode *N,
	                                       SmallVectorImpl<SDValue> & Results,
	                                       SelectionDAG &DAG) {
	  assert(N->getValueType(0) == MVT::i128 &&
	         "AtomicCmpSwap on types less than 128 should be legal");

	  std::pair<SDValue, SDValue> Desired = splitInt128(N->getOperand(2), DAG);
	  std::pair<SDValue, SDValue> New = splitInt128(N->getOperand(3), DAG);

	  SDValue Ops[] = {N->getOperand(1), Desired.first, Desired.second,
	                   New.first,        New.second,    N->getOperand(0)};
	  SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i64, MVT::i32, MVT::Other);
	  SDNode *CmpSwap =
	      DAG.getMachineNode(AArch64::CMP_SWAP_128, SDLoc(N), VTs, Ops);

	  // Attach the original node's memory operand to the new machine node.
	  MachineFunction &MF = DAG.getMachineFunction();
	  MachineSDNode::mmo_iterator MemOp = MF.allocateMemRefsArray(1);
	  MemOp[0] = cast<MemSDNode>(N)->getMemOperand();
	  cast<MachineSDNode>(CmpSwap)->setMemRefs(MemOp, MemOp + 1);

	  // Result 2 (the i32) is not forwarded.
	  for (unsigned ResNo : {0u, 1u, 3u})
	    Results.push_back(SDValue(CmpSwap, ResNo));
	}

	// Target hook that supplies custom replacement values for N, dispatching on
	// its opcode. Replacement values, if any, are appended to Results. An opcode
	// without a handler here is a programming error.
	void AArch64TargetLowering::ReplaceNodeResults(
	    SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const {
	  switch (N->getOpcode()) {
	  case ISD::BITCAST:
	    ReplaceBITCASTResults(N, Results, DAG);
	    break;

	  // Across-vector reductions: each passes the element-wise operation used to
	  // combine the two halves, followed by the reduction opcode itself.
	  case AArch64ISD::SADDV:
	    ReplaceReductionResults(N, Results, DAG, ISD::ADD, AArch64ISD::SADDV);
	    break;
	  case AArch64ISD::UADDV:
	    ReplaceReductionResults(N, Results, DAG, ISD::ADD, AArch64ISD::UADDV);
	    break;
	  case AArch64ISD::SMINV:
	    ReplaceReductionResults(N, Results, DAG, ISD::SMIN, AArch64ISD::SMINV);
	    break;
	  case AArch64ISD::UMINV:
	    ReplaceReductionResults(N, Results, DAG, ISD::UMIN, AArch64ISD::UMINV);
	    break;
	  case AArch64ISD::SMAXV:
	    ReplaceReductionResults(N, Results, DAG, ISD::SMAX, AArch64ISD::SMAXV);
	    break;
	  case AArch64ISD::UMAXV:
	    ReplaceReductionResults(N, Results, DAG, ISD::UMAX, AArch64ISD::UMAXV);
	    break;

	  case ISD::FP_TO_UINT:
	  case ISD::FP_TO_SINT:
	    assert(N->getValueType(0) == MVT::i128 && "unexpected illegal conversion");
	    // Let normal code take care of it by not adding anything to Results.
	    break;

	  case ISD::ATOMIC_CMP_SWAP:
	    ReplaceCMP_SWAP_128Results(N, Results, DAG);
	    break;

	  default:
	    llvm_unreachable("Don't know how to custom expand this");
	  }
	}

	// Always answers true for non-Android targets; Android targets defer to the
	// generic TargetLowering answer.
	bool AArch64TargetLowering::useLoadStackGuardNode() const {
	  if (Subtarget->isTargetAndroid())
	    return TargetLowering::useLoadStackGuardNode();
	  return true;
	}

	// Threshold for turning repeated FDIVs by the same divisor into FMULs by the
	// reciprocal: the rewrite is requested once there are three or more such
	// FDIVs.
	unsigned AArch64TargetLowering::combineRepeatedFPDivisors() const {
	  const unsigned MinRepeatedDivisors = 3;
	  return MinRepeatedDivisors;
	}

	// Chooses how the type legalizer should treat vector type VT.
	//
	// The single-element vectors v1i8, v1i16, v1i32 and v1f32 are widened rather
	// than promoted; every other type gets the generic TargetLoweringBase
	// choice.
	TargetLoweringBase::LegalizeTypeAction
	AArch64TargetLowering::getPreferredVectorAction(EVT VT) const {
	  // Only simple types can be one of the special cases below, and asking a
	  // non-simple EVT for its simple type is not valid, so test that first.
	  if (VT.isSimple()) {
	    MVT SVT = VT.getSimpleVT();
	    // During type legalization, we prefer to widen v1i8, v1i16, v1i32 and
	    // v1f32 instead of promoting them.
	    if (SVT == MVT::v1i8 || SVT == MVT::v1i16 || SVT == MVT::v1i32 ||
	        SVT == MVT::v1f32)
	      return TypeWidenVector;
	  }

	  return TargetLoweringBase::getPreferredVectorAction(VT);
	}

	// Loads and stores less than 128-bits are already atomic; ones above that
	// are doomed anyway, so defer to the default libcall and blame the OS when
	// things go wrong. Hence only a store of exactly 128 bits is expanded in IR.
	bool AArch64TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
	  Type *StoredTy = SI->getValueOperand()->getType();
	  unsigned Size = StoredTy->getPrimitiveSizeInBits();
	  const bool Is128Bit = (Size == 128);
	  return Is128Bit;
	}

	// Loads and stores less than 128-bits are already atomic; ones above that
	// are doomed anyway, so defer to the default libcall and blame the OS when
	// things go wrong. Hence only a load of exactly 128 bits is expanded, using
	// the LLSC strategy.
	TargetLowering::AtomicExpansionKind
	AArch64TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
	  unsigned Size = LI->getType()->getPrimitiveSizeInBits();
	  if (Size == 128)
	    return AtomicExpansionKind::LLSC;
	  return AtomicExpansionKind::None;
	}

	// For the real atomic operations, we have ldxr/stxr up to 128 bits, so any
	// read-modify-write of at most 128 bits is expanded with the LLSC strategy
	// and anything larger is left unexpanded.
	TargetLowering::AtomicExpansionKind
	AArch64TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
	  unsigned Size = AI->getType()->getPrimitiveSizeInBits();
	  if (Size > 128)
	    return AtomicExpansionKind::None;
	  return AtomicExpansionKind::LLSC;
	}

	// Says whether a cmpxchg should be expanded in IR: yes whenever the target
	// machine's optimization level is non-zero. The instruction itself is not
	// inspected.
	bool AArch64TargetLowering::shouldExpandAtomicCmpXchgInIR(
	    AtomicCmpXchgInst *AI) const {
	  // At -O0, fast-regalloc cannot cope with the live vregs necessary to
	  // implement cmpxchg without spilling. If the address being exchanged is also
	  // on the stack and close enough to the spill slot, this can lead to a
	  // situation where the monitor always gets cleared and the atomic operation
	  // can never succeed. So at -O0 we need a late-expanded pseudo-inst instead.
	  const bool Optimizing = getTargetMachine().getOptLevel() != 0;
	  return Optimizing;
	}

	// Emits the load-linked half of an LL/SC sequence at Builder's insertion
	// point and returns the loaded value, typed as the pointee type of Addr.
	//
	// Ord selects the acquire form of the intrinsic (ldaxp / ldaxr) when it is
	// acquire or stronger, and the plain form (ldxp / ldxr) otherwise.
	Value *AArch64TargetLowering::emitLoadLinked(IRBuilder<> &Builder, Value *Addr,
	                                             AtomicOrdering Ord) const {
	  Module *M = Builder.GetInsertBlock()->getParent()->getParent();
	  Type *ValTy = cast<PointerType>(Addr->getType())->getElementType();
	  bool IsAcquire = isAcquireOrStronger(Ord);

	  // Since i128 isn't legal and intrinsics don't get type-lowered, the
	  // ldxp/ldaxp intrinsics must return {i64, i64} and we have to recombine
	  // them into a single i128 here.
	  if (ValTy->getPrimitiveSizeInBits() == 128) {
	    Intrinsic::ID Int =
	        IsAcquire ? Intrinsic::aarch64_ldaxp : Intrinsic::aarch64_ldxp;
	    Function *Ldxr = Intrinsic::getDeclaration(M, Int);

	    // The pair intrinsic is called with an i8* address.
	    Addr = Builder.CreateBitCast(Addr, Type::getInt8PtrTy(M->getContext()));
	    Value *LoHi = Builder.CreateCall(Ldxr, Addr, "lohi");

	    Value *Lo = Builder.CreateExtractValue(LoHi, 0, "lo");
	    Value *Hi = Builder.CreateExtractValue(LoHi, 1, "hi");
	    Lo = Builder.CreateZExt(Lo, ValTy, "lo64");
	    Hi = Builder.CreateZExt(Hi, ValTy, "hi64");
	    // val = lo | (hi << 64)
	    return Builder.CreateOr(
	        Lo, Builder.CreateShl(Hi, ConstantInt::get(ValTy, 64)), "val64");
	  }

	  // The remaining sizes use the intrinsic overloaded on the pointer type.
	  Type *Tys[] = { Addr->getType() };
	  Intrinsic::ID Int =
	      IsAcquire ? Intrinsic::aarch64_ldaxr : Intrinsic::aarch64_ldxr;
	  Function *Ldxr = Intrinsic::getDeclaration(M, Int, Tys);

	  // Addr is unchanged on this path, so ValTy is still its pointee type.
	  return Builder.CreateTruncOrBitCast(Builder.CreateCall(Ldxr, Addr), ValTy);
	}

	/// Emit a call to the aarch64_clrex intrinsic at the builder's current
	/// insertion point.
	void AArch64TargetLowering::emitAtomicCmpXchgNoStoreLLBalance(
	    IRBuilder<> &Builder) const {
	  Module *Mod = Builder.GetInsertBlock()->getParent()->getParent();
	  auto *ClrexFn = Intrinsic::getDeclaration(Mod, Intrinsic::aarch64_clrex);
	  Builder.CreateCall(ClrexFn);
	}

	/// Emit a store-exclusive of \p Val to \p Addr.
	///
	/// \param Builder  IR builder positioned where the store should be inserted.
	/// \param Val      Value to store.
	/// \param Addr     Destination pointer.
	/// \param Ord      Ordering; release-or-stronger selects the stlxr/stlxp
	///                 intrinsics, otherwise stxr/stxp are used.
	/// \returns the call to the store-exclusive intrinsic.
	Value *AArch64TargetLowering::emitStoreConditional(IRBuilder<> &Builder,
	                                                   Value *Val, Value *Addr,
	                                                   AtomicOrdering Ord) const {
	  Module *M = Builder.GetInsertBlock()->getParent()->getParent();
	  bool IsRelease = isReleaseOrStronger(Ord);

	  // Since the intrinsics must have legal type, the i128 intrinsics take two
	  // parameters: "i64, i64". We must marshal Val into the appropriate form
	  // before the call.
	  if (Val->getType()->getPrimitiveSizeInBits() == 128) {
	    Intrinsic::ID Int =
	        IsRelease ? Intrinsic::aarch64_stlxp : Intrinsic::aarch64_stxp;
	    Function *Stxr = Intrinsic::getDeclaration(M, Int);
	    Type *Int64Ty = Type::getInt64Ty(M->getContext());

	    // lo = trunc(val), hi = trunc(val >> 64)
	    Value *Lo = Builder.CreateTrunc(Val, Int64Ty, "lo");
	    Value *Hi = Builder.CreateTrunc(Builder.CreateLShr(Val, 64), Int64Ty, "hi");
	    // The pair intrinsics take an i8* address.
	    Addr = Builder.CreateBitCast(Addr, Type::getInt8PtrTy(M->getContext()));
	    return Builder.CreateCall(Stxr, {Lo, Hi, Addr});
	  }

	  // The single-register intrinsics are overloaded on the pointer type.
	  Intrinsic::ID Int =
	      IsRelease ? Intrinsic::aarch64_stlxr : Intrinsic::aarch64_stxr;
	  Type *Tys[] = { Addr->getType() };
	  Function *Stxr = Intrinsic::getDeclaration(M, Int, Tys);

	  // Convert Val to the type of the intrinsic's first parameter.
	  return Builder.CreateCall(Stxr,
	                            {Builder.CreateZExtOrBitCast(
	                                 Val, Stxr->getFunctionType()->getParamType(0)),
	                             Addr});
	}

	/// Array-typed arguments are the only ones reported as needing consecutive
	/// registers; the calling convention and vararg flag are not consulted.
	bool AArch64TargetLowering::functionArgumentNeedsConsecutiveRegisters(
	    Type *Ty, CallingConv::ID CallConv, bool isVarArg) const {
	  const bool IsArray = Ty->isArrayTy();
	  return IsArray;
	}

	/// Always answers false, whatever the context or value type.
	bool AArch64TargetLowering::shouldNormalizeToSelectSequence(LLVMContext &,
	                                                            EVT) const {
	  return false;
	}

	/// Return the location of the stack guard. Android targets get a fixed slot
	/// relative to the thread pointer; all others use the generic lowering.
	Value *AArch64TargetLowering::getIRStackGuard(IRBuilder<> &IRB) const {
	  if (!Subtarget->isTargetAndroid())
	    return TargetLowering::getIRStackGuard(IRB);

	  // Android provides a fixed TLS slot for the stack cookie. See the definition
	  // of TLS_SLOT_STACK_GUARD in
	  // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h
	  const unsigned TlsOffset = 0x28;
	  Module *M = IRB.GetInsertBlock()->getParent()->getParent();
	  Function *ThreadPointerFunc =
	      Intrinsic::getDeclaration(M, Intrinsic::thread_pointer);

	  // thread_pointer() + TlsOffset, cast to i8**.
	  Value *ThreadPtr = IRB.CreateCall(ThreadPointerFunc);
	  Value *Slot = IRB.CreateConstGEP1_32(ThreadPtr, TlsOffset);
	  Type *SlotPtrTy = Type::getInt8PtrTy(IRB.getContext())->getPointerTo(0);
	  return IRB.CreatePointerCast(Slot, SlotPtrTy);
	}

	/// Return the location of the SafeStack pointer. Android targets get a fixed
	/// slot relative to the thread pointer; all others use the generic lowering.
	Value *AArch64TargetLowering::getSafeStackPointerLocation(IRBuilder<> &IRB) const {
	  if (!Subtarget->isTargetAndroid())
	    return TargetLowering::getSafeStackPointerLocation(IRB);

	  // Android provides a fixed TLS slot for the SafeStack pointer. See the
	  // definition of TLS_SLOT_SAFESTACK in
	  // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h
	  const unsigned TlsOffset = 0x48;
	  Module *M = IRB.GetInsertBlock()->getParent()->getParent();
	  Function *ThreadPointerFunc =
	      Intrinsic::getDeclaration(M, Intrinsic::thread_pointer);

	  // thread_pointer() + TlsOffset, cast to i8**.
	  Value *ThreadPtr = IRB.CreateCall(ThreadPointerFunc);
	  Value *Slot = IRB.CreateConstGEP1_32(ThreadPtr, TlsOffset);
	  Type *SlotPtrTy = Type::getInt8PtrTy(IRB.getContext())->getPointerTo(0);
	  return IRB.CreatePointerCast(Slot, SlotPtrTy);
	}

	/// Mark the function containing \p Entry as using split CSR handling.
	void AArch64TargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
	  // Set IsSplitCSR in the function's AArch64FunctionInfo.
	  auto *FuncInfo = Entry->getParent()->getInfo<AArch64FunctionInfo>();
	  FuncInfo->setIsSplitCSR(true);
	}

	/// For every callee-saved register handled via copy, insert a copy into a
	/// fresh virtual register at the start of \p Entry and a copy back into the
	/// physical register before the first terminator of each block in \p Exits.
	///
	/// Does nothing when the register info reports no such registers for the
	/// function.
	void AArch64TargetLowering::insertCopiesSplitCSR(
	    MachineBasicBlock *Entry,
	    const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
	  const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
	  const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
	  if (!IStart)
	    return;

	  const TargetInstrInfo *TII = Subtarget->getInstrInfo();
	  MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
	  MachineBasicBlock::iterator MBBI = Entry->begin();
	  // The register list is terminated by a zero entry.
	  for (const MCPhysReg *I = IStart; *I; ++I) {
	    const TargetRegisterClass *RC = nullptr;
	    if (AArch64::GPR64RegClass.contains(*I))
	      RC = &AArch64::GPR64RegClass;
	    else if (AArch64::FPR64RegClass.contains(*I))
	      RC = &AArch64::FPR64RegClass;
	    else
	      llvm_unreachable("Unexpected register class in CSRsViaCopy!");

	    unsigned NewVR = MRI->createVirtualRegister(RC);
	    // Create copy from CSR to a virtual register.
	    // FIXME: this currently does not emit CFI pseudo-instructions, it works
	    // fine for CXX_FAST_TLS since the C++-style TLS access functions should be
	    // nounwind. If we want to generalize this later, we may need to emit
	    // CFI pseudo-instructions.
	    assert(Entry->getParent()->getFunction()->hasFnAttribute(
	               Attribute::NoUnwind) &&
	           "Function should be nounwind in insertCopiesSplitCSR!");
	    Entry->addLiveIn(*I);
	    BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
	        .addReg(*I);

	    // Insert the copy-back instructions right before the terminator.
	    for (auto *Exit : Exits)
	      BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
	              TII->get(TargetOpcode::COPY), *I)
	          .addReg(NewVR);
	  }
	}

	/// Integer division counts as cheap only for scalar types in functions
	/// carrying the MinSize attribute.
	bool AArch64TargetLowering::isIntDivCheap(EVT VT, AttributeSet Attr) const {
	  // AArch64 has no vector integer division. Leaving a vector division as-is
	  // loses even on size, because it has to be scalarized, whereas the
	  // alternative sequence can be done in vector form.
	  if (VT.isVector())
	    return false;

	  // Scalar integer division is expensive, but when aggressively optimizing
	  // for code size a div instruction is usually smaller than the alternative
	  // sequence, so prefer it.
	  return Attr.hasAttribute(AttributeSet::FunctionIndex, Attribute::MinSize);
	}
	Index: projects/clang400-import/contrib/llvm/lib/Target/X86/X86ExpandPseudo.cpp
	===================================================================
	--- projects/clang400-import/contrib/llvm/lib/Target/X86/X86ExpandPseudo.cpp (revision 313642)
	+++ projects/clang400-import/contrib/llvm/lib/Target/X86/X86ExpandPseudo.cpp (revision 313643)
	@@ -1,297 +1,278 @@
	//===------- X86ExpandPseudo.cpp - Expand pseudo instructions -------------===//
	//
	// The LLVM Compiler Infrastructure
	//
	// This file is distributed under the University of Illinois Open Source
	// License. See LICENSE.TXT for details.
	//
	//===----------------------------------------------------------------------===//
	//
	// This file contains a pass that expands pseudo instructions into target
	// instructions to allow proper scheduling, if-conversion, other late
	// optimizations, or simply the encoding of the instructions.
	//
	//===----------------------------------------------------------------------===//

	#include "X86.h"
	#include "X86FrameLowering.h"
	#include "X86InstrBuilder.h"
	#include "X86InstrInfo.h"
	#include "X86MachineFunctionInfo.h"
	#include "X86Subtarget.h"
	#include "llvm/Analysis/EHPersonalities.h"
	#include "llvm/CodeGen/MachineFunctionPass.h"
	#include "llvm/CodeGen/MachineInstrBuilder.h"
	#include "llvm/CodeGen/Passes.h" // For IDs of passes that are preserved.
	#include "llvm/IR/GlobalValue.h"
	using namespace llvm;

	#define DEBUG_TYPE "x86-pseudo"

	namespace {
	/// Machine-function pass that walks every basic block and expands the X86
	/// pseudo instructions recognised by ExpandMI into target instructions.
	class X86ExpandPseudo : public MachineFunctionPass {
	public:
	static char ID;
	X86ExpandPseudo() : MachineFunctionPass(ID) {}

	// Declares that the CFG, machine loop info and machine dominators are
	// preserved by this pass.
	void getAnalysisUsage(AnalysisUsage &AU) const override {
	AU.setPreservesCFG();
	AU.addPreservedID(MachineLoopInfoID);
	AU.addPreservedID(MachineDominatorsID);
	MachineFunctionPass::getAnalysisUsage(AU);
	}

	// Per-function state; all five are (re)assigned at the top of
	// runOnMachineFunction before any expansion happens.
	const X86Subtarget *STI;
	const X86InstrInfo *TII;
	const X86RegisterInfo *TRI;
	const X86MachineFunctionInfo *X86FI;
	const X86FrameLowering *X86FL;

	bool runOnMachineFunction(MachineFunction &Fn) override;

	// The pass requires the NoVRegs machine-function property.
	MachineFunctionProperties getRequiredProperties() const override {
	return MachineFunctionProperties().set(
	MachineFunctionProperties::Property::NoVRegs);
	}

	StringRef getPassName() const override {
	return "X86 pseudo instruction expansion pass";
	}

	private:
	// Expand a single instruction / every instruction of a block; both return
	// true when something was changed.
	bool ExpandMI(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI);
	bool ExpandMBB(MachineBasicBlock &MBB);
	};
	char X86ExpandPseudo::ID = 0;
	} // End anonymous namespace.

	/// If \p MBBI is a pseudo instruction, this method expands
	/// it to the corresponding (sequence of) actual instruction(s).
	///
	/// The opcodes handled are the TCRETURN* tail-call pseudos, EH_RETURN and
	/// EH_RETURN64, IRET, RET, EH_RESTORE, and LCMPXCHG8B_SAVE_EBX /
	/// LCMPXCHG16B_SAVE_RBX; any other opcode is left untouched.
	///
	/// \param MBB   Block containing the instruction.
	/// \param MBBI  Iterator to the instruction to expand; the instruction is
	///              erased for every handled opcode except EH_RETURN/EH_RETURN64.
	/// \returns true if \p MBBI has been expanded.
	bool X86ExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
	MachineBasicBlock::iterator MBBI) {
	MachineInstr &MI = *MBBI;
	unsigned Opcode = MI.getOpcode();
	DebugLoc DL = MBBI->getDebugLoc();
	switch (Opcode) {
	default:
	return false;
	case X86::TCRETURNdi:
	- case X86::TCRETURNdicc:
	case X86::TCRETURNri:
	case X86::TCRETURNmi:
	case X86::TCRETURNdi64:
	- case X86::TCRETURNdi64cc:
	case X86::TCRETURNri64:
	case X86::TCRETURNmi64: {
	// The memory forms carry five address operands first, so their stack
	// adjustment is operand 5 rather than operand 1.
	bool isMem = Opcode == X86::TCRETURNmi || Opcode == X86::TCRETURNmi64;
	MachineOperand &JumpTarget = MBBI->getOperand(0);
	MachineOperand &StackAdjust = MBBI->getOperand(isMem ? 5 : 1);
	assert(StackAdjust.isImm() && "Expecting immediate value.");

	// Adjust stack pointer.
	int StackAdj = StackAdjust.getImm();
	int MaxTCDelta = X86FI->getTCReturnAddrDelta();
	int Offset = 0;
	assert(MaxTCDelta <= 0 && "MaxTCDelta should never be positive");

	// Incorporate the retaddr area.
	Offset = StackAdj - MaxTCDelta;
	assert(Offset >= 0 && "Offset should never be negative");

	- if (Opcode == X86::TCRETURNdicc || Opcode == X86::TCRETURNdi64cc) {
	- assert(Offset == 0 && "Conditional tail call cannot adjust the stack.");
	- }
	-
	if (Offset) {
	// Check for possible merge with preceding ADD instruction.
	Offset += X86FL->mergeSPUpdates(MBB, MBBI, true);
	X86FL->emitSPUpdate(MBB, MBBI, Offset, /*InEpilogue=*/true);
	}

	// Jump to label or value in register.
	bool IsWin64 = STI->isTargetWin64();
	- if (Opcode == X86::TCRETURNdi || Opcode == X86::TCRETURNdicc ||
	- Opcode == X86::TCRETURNdi64 || Opcode == X86::TCRETURNdi64cc) {
	+ if (Opcode == X86::TCRETURNdi || Opcode == X86::TCRETURNdi64) {
	unsigned Op;
	switch (Opcode) {
	case X86::TCRETURNdi:
	Op = X86::TAILJMPd;
	break;
	- case X86::TCRETURNdicc:
	- Op = X86::TAILJMPd_CC;
	- break;
	- case X86::TCRETURNdi64cc:
	- assert(!IsWin64 && "Conditional tail calls confuse the Win64 unwinder.");
	- // TODO: We could do it for Win64 "leaf" functions though; PR30337.
	- Op = X86::TAILJMPd64_CC;
	- break;
	default:
	// Note: Win64 uses REX prefixes for indirect jumps out of functions, but
	// not direct ones.
	Op = X86::TAILJMPd64;
	break;
	}
	MachineInstrBuilder MIB = BuildMI(MBB, MBBI, DL, TII->get(Op));
	if (JumpTarget.isGlobal()) {
	MIB.addGlobalAddress(JumpTarget.getGlobal(), JumpTarget.getOffset(),
	JumpTarget.getTargetFlags());
	} else {
	assert(JumpTarget.isSymbol());
	MIB.addExternalSymbol(JumpTarget.getSymbolName(),
	JumpTarget.getTargetFlags());
	}
	- if (Op == X86::TAILJMPd_CC || Op == X86::TAILJMPd64_CC) {
	- MIB.addImm(MBBI->getOperand(2).getImm());
	- }
	-
	} else if (Opcode == X86::TCRETURNmi || Opcode == X86::TCRETURNmi64) {
	unsigned Op = (Opcode == X86::TCRETURNmi)
	? X86::TAILJMPm
	: (IsWin64 ? X86::TAILJMPm64_REX : X86::TAILJMPm64);
	MachineInstrBuilder MIB = BuildMI(MBB, MBBI, DL, TII->get(Op));
	// Forward the five address operands of the pseudo.
	for (unsigned i = 0; i != 5; ++i)
	MIB.addOperand(MBBI->getOperand(i));
	} else if (Opcode == X86::TCRETURNri64) {
	BuildMI(MBB, MBBI, DL,
	TII->get(IsWin64 ? X86::TAILJMPr64_REX : X86::TAILJMPr64))
	.addReg(JumpTarget.getReg(), RegState::Kill);
	} else {
	BuildMI(MBB, MBBI, DL, TII->get(X86::TAILJMPr))
	.addReg(JumpTarget.getReg(), RegState::Kill);
	}

	// The jump was inserted immediately before the pseudo; give it the
	// pseudo's implicit operands.
	MachineInstr &NewMI = *std::prev(MBBI);
	NewMI.copyImplicitOps(*MBBI->getParent()->getParent(), *MBBI);

	// Delete the pseudo instruction TCRETURN.
	MBB.erase(MBBI);

	return true;
	}
	case X86::EH_RETURN:
	case X86::EH_RETURN64: {
	MachineOperand &DestAddr = MBBI->getOperand(0);
	assert(DestAddr.isReg() && "Offset should be in register!");
	const bool Uses64BitFramePtr =
	STI->isTarget64BitLP64() || STI->isTargetNaCl64();
	unsigned StackPtr = TRI->getStackRegister();
	// Move the destination address into the stack pointer.
	BuildMI(MBB, MBBI, DL,
	TII->get(Uses64BitFramePtr ? X86::MOV64rr : X86::MOV32rr), StackPtr)
	.addReg(DestAddr.getReg());
	// The EH_RETURN pseudo is really removed during the MC Lowering.
	return true;
	}
	case X86::IRET: {
	// Adjust stack to erase error code
	int64_t StackAdj = MBBI->getOperand(0).getImm();
	X86FL->emitSPUpdate(MBB, MBBI, StackAdj, true);
	// Replace pseudo with machine iret
	BuildMI(MBB, MBBI, DL,
	TII->get(STI->is64Bit() ? X86::IRET64 : X86::IRET32));
	MBB.erase(MBBI);
	return true;
	}
	case X86::RET: {
	// Adjust stack to erase error code
	int64_t StackAdj = MBBI->getOperand(0).getImm();
	MachineInstrBuilder MIB;
	if (StackAdj == 0) {
	MIB = BuildMI(MBB, MBBI, DL,
	TII->get(STI->is64Bit() ? X86::RETQ : X86::RETL));
	} else if (isUInt<16>(StackAdj)) {
	MIB = BuildMI(MBB, MBBI, DL,
	TII->get(STI->is64Bit() ? X86::RETIQ : X86::RETIL))
	.addImm(StackAdj);
	} else {
	assert(!STI->is64Bit() &&
	"shouldn't need to do this for x86_64 targets!");
	// A ret can only handle immediates as big as 2**16-1. If we need to pop
	// off bytes before the return address, we must do it manually.
	BuildMI(MBB, MBBI, DL, TII->get(X86::POP32r)).addReg(X86::ECX, RegState::Define);
	X86FL->emitSPUpdate(MBB, MBBI, StackAdj, /*InEpilogue=*/true);
	BuildMI(MBB, MBBI, DL, TII->get(X86::PUSH32r)).addReg(X86::ECX);
	MIB = BuildMI(MBB, MBBI, DL, TII->get(X86::RETL));
	}
	// Carry over every operand after the adjustment immediate.
	for (unsigned I = 1, E = MBBI->getNumOperands(); I != E; ++I)
	MIB.addOperand(MBBI->getOperand(I));
	MBB.erase(MBBI);
	return true;
	}
	case X86::EH_RESTORE: {
	// Restore ESP and EBP, and optionally ESI if required.
	bool IsSEH = isAsynchronousEHPersonality(classifyEHPersonality(
	MBB.getParent()->getFunction()->getPersonalityFn()));
	X86FL->restoreWin32EHStackPointers(MBB, MBBI, DL, /*RestoreSP=*/IsSEH);
	MBBI->eraseFromParent();
	return true;
	}
	case X86::LCMPXCHG8B_SAVE_EBX:
	case X86::LCMPXCHG16B_SAVE_RBX: {
	// Perform the following transformation.
	// SaveRbx = pseudocmpxchg Addr, <4 opds for the address>, InArg, SaveRbx
	// =>
	// [E|R]BX = InArg
	// actualcmpxchg Addr
	// [E|R]BX = SaveRbx
	const MachineOperand &InArg = MBBI->getOperand(6);
	unsigned SaveRbx = MBBI->getOperand(7).getReg();

	unsigned ActualInArg =
	Opcode == X86::LCMPXCHG8B_SAVE_EBX ? X86::EBX : X86::RBX;
	// Copy the input argument of the pseudo into the argument of the
	// actual instruction.
	TII->copyPhysReg(MBB, MBBI, DL, ActualInArg, InArg.getReg(),
	InArg.isKill());
	// Create the actual instruction.
	unsigned ActualOpc =
	Opcode == X86::LCMPXCHG8B_SAVE_EBX ? X86::LCMPXCHG8B : X86::LCMPXCHG16B;
	MachineInstr *NewInstr = BuildMI(MBB, MBBI, DL, TII->get(ActualOpc));
	// Copy the operands related to the address.
	for (unsigned Idx = 1; Idx < 6; ++Idx)
	NewInstr->addOperand(MBBI->getOperand(Idx));
	// Finally, restore the value of RBX.
	TII->copyPhysReg(MBB, MBBI, DL, ActualInArg, SaveRbx,
	/*SrcIsKill*/ true);

	// Delete the pseudo.
	MBBI->eraseFromParent();
	return true;
	}
	}
	llvm_unreachable("Previous switch has a fallthrough?");
	}

	/// Expand all pseudo instructions contained in \p MBB.
	/// \returns true if any expansion occurred for \p MBB.
	bool X86ExpandPseudo::ExpandMBB(MachineBasicBlock &MBB) {
	  bool Modified = false;

	  // MBBI may be invalidated by the expansion, so the successor is fetched
	  // before each call to ExpandMI.
	  MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
	  while (MBBI != E) {
	    MachineBasicBlock::iterator NMBBI = std::next(MBBI);
	    Modified |= ExpandMI(MBB, MBBI);
	    MBBI = NMBBI;
	  }

	  return Modified;
	}

	/// Cache the subtarget-derived helpers for \p MF, then expand the pseudo
	/// instructions of every basic block.
	/// \returns true if any block was modified.
	bool X86ExpandPseudo::runOnMachineFunction(MachineFunction &MF) {
	  STI = &static_cast<const X86Subtarget &>(MF.getSubtarget());
	  TII = STI->getInstrInfo();
	  TRI = STI->getRegisterInfo();
	  X86FI = MF.getInfo<X86MachineFunctionInfo>();
	  X86FL = STI->getFrameLowering();

	  bool Modified = false;
	  for (MachineBasicBlock &MBB : MF)
	    Modified |= ExpandMBB(MBB);
	  return Modified;
	}

	/// Factory for the pseudo instruction expansion pass: returns a newly
	/// allocated X86ExpandPseudo.
	FunctionPass *llvm::createX86ExpandPseudoPass() {
	  FunctionPass *Pass = new X86ExpandPseudo();
	  return Pass;
	}
	Index: projects/clang400-import/contrib/llvm/lib/Target/X86/X86InstrControl.td
	===================================================================
	--- projects/clang400-import/contrib/llvm/lib/Target/X86/X86InstrControl.td (revision 313642)
	+++ projects/clang400-import/contrib/llvm/lib/Target/X86/X86InstrControl.td (revision 313643)
	@@ -1,358 +1,327 @@
	//===-- X86InstrControl.td - Control Flow Instructions -----*- tablegen -*-===//
	//
	// The LLVM Compiler Infrastructure
	//
	// This file is distributed under the University of Illinois Open Source
	// License. See LICENSE.TXT for details.
	//
	//===----------------------------------------------------------------------===//
	//
	// This file describes the X86 jump, return, call, and related instructions.
	//
	//===----------------------------------------------------------------------===//

	//===----------------------------------------------------------------------===//
	// Control Flow Instructions.
	//

	// Return instructions.
	//
	// The X86retflag return instructions are variadic because we may add ST0 and
	// ST1 arguments when returning values on the x87 stack.
	//
	// Everything in this block is a terminator, a return and a barrier. The
	// near-return forms RETL/RETQ/RETIL/RETIQ and the RET pseudo take
	// variable_ops; the remaining forms do not.
	let isTerminator = 1, isReturn = 1, isBarrier = 1,
	hasCtrlDep = 1, FPForm = SpecialFP, SchedRW = [WriteJumpLd] in {
	def RETL : I <0xC3, RawFrm, (outs), (ins variable_ops),
	"ret{l}", [], IIC_RET>, OpSize32,
	Requires<[Not64BitMode]>;
	def RETQ : I <0xC3, RawFrm, (outs), (ins variable_ops),
	"ret{q}", [], IIC_RET>, OpSize32,
	Requires<[In64BitMode]>;
	def RETW : I <0xC3, RawFrm, (outs), (ins),
	"ret{w}",
	[], IIC_RET>, OpSize16;
	def RETIL : Ii16<0xC2, RawFrm, (outs), (ins i16imm:$amt, variable_ops),
	"ret{l}\t$amt",
	[], IIC_RET_IMM>, OpSize32,
	Requires<[Not64BitMode]>;
	def RETIQ : Ii16<0xC2, RawFrm, (outs), (ins i16imm:$amt, variable_ops),
	"ret{q}\t$amt",
	[], IIC_RET_IMM>, OpSize32,
	Requires<[In64BitMode]>;
	def RETIW : Ii16<0xC2, RawFrm, (outs), (ins i16imm:$amt),
	"ret{w}\t$amt",
	[], IIC_RET_IMM>, OpSize16;
	def LRETL : I <0xCB, RawFrm, (outs), (ins),
	"{l}ret{l|f}", [], IIC_RET>, OpSize32;
	def LRETQ : RI <0xCB, RawFrm, (outs), (ins),
	"{l}ret{|f}q", [], IIC_RET>, Requires<[In64BitMode]>;
	def LRETW : I <0xCB, RawFrm, (outs), (ins),
	"{l}ret{w|f}", [], IIC_RET>, OpSize16;
	def LRETIL : Ii16<0xCA, RawFrm, (outs), (ins i16imm:$amt),
	"{l}ret{l|f}\t$amt", [], IIC_RET>, OpSize32;
	def LRETIQ : RIi16<0xCA, RawFrm, (outs), (ins i16imm:$amt),
	"{l}ret{|f}q\t$amt", [], IIC_RET>, Requires<[In64BitMode]>;
	def LRETIW : Ii16<0xCA, RawFrm, (outs), (ins i16imm:$amt),
	"{l}ret{w|f}\t$amt", [], IIC_RET>, OpSize16;

	// IRET16/IRET32/IRET64 are the machine return-from-interrupt instructions.
	// Because a post-epilogue stack adjustment is sometimes needed, codegen
	// emits the IRET pseudo form, which expands to include an SP adjustment if
	// necessary.
	def IRET16 : I <0xcf, RawFrm, (outs), (ins), "iret{w}", [], IIC_IRET>,
	OpSize16;
	def IRET32 : I <0xcf, RawFrm, (outs), (ins), "iret{l|d}", [],
	IIC_IRET>, OpSize32;
	def IRET64 : RI <0xcf, RawFrm, (outs), (ins), "iretq", [],
	IIC_IRET>, Requires<[In64BitMode]>;
	let isCodeGenOnly = 1 in
	def IRET : PseudoI<(outs), (ins i32imm:$adj), [(X86iret timm:$adj)]>;
	def RET : PseudoI<(outs), (ins i32imm:$adj, variable_ops), [(X86retflag timm:$adj)]>;
	}

	// Unconditional branches.
	//
	// JMP_1 (8-bit pc-relative) is the only form carrying a selection pattern
	// (br bb:$dst). JMP_2 and JMP_4 are codegen-only, have no pattern, and are
	// marked ForceDisassemble.
	let isBarrier = 1, isBranch = 1, isTerminator = 1, SchedRW = [WriteJump] in {
	def JMP_1 : Ii8PCRel<0xEB, RawFrm, (outs), (ins brtarget8:$dst),
	"jmp\t$dst", [(br bb:$dst)], IIC_JMP_REL>;
	let hasSideEffects = 0, isCodeGenOnly = 1, ForceDisassemble = 1 in {
	def JMP_2 : Ii16PCRel<0xE9, RawFrm, (outs), (ins brtarget16:$dst),
	"jmp\t$dst", [], IIC_JMP_REL>, OpSize16;
	def JMP_4 : Ii32PCRel<0xE9, RawFrm, (outs), (ins brtarget32:$dst),
	"jmp\t$dst", [], IIC_JMP_REL>, OpSize32;
	}
	}

	// Conditional Branches.
	//
	// ICBr defines three records per condition code, all reading EFLAGS:
	//   _1  8-bit pc-relative form using opc1; carries the X86brcond pattern.
	//   _2  16-bit pc-relative form using opc4 (TB, OpSize16); no pattern.
	//   _4  32-bit pc-relative form using opc4 (TB, OpSize32); no pattern.
	// The _2/_4 forms are codegen-only and marked ForceDisassemble.
	let isBranch = 1, isTerminator = 1, Uses = [EFLAGS], SchedRW = [WriteJump] in {
	multiclass ICBr<bits<8> opc1, bits<8> opc4, string asm, PatFrag Cond> {
	def _1 : Ii8PCRel <opc1, RawFrm, (outs), (ins brtarget8:$dst), asm,
	[(X86brcond bb:$dst, Cond, EFLAGS)], IIC_Jcc>;
	let hasSideEffects = 0, isCodeGenOnly = 1, ForceDisassemble = 1 in {
	def _2 : Ii16PCRel<opc4, RawFrm, (outs), (ins brtarget16:$dst), asm,
	[], IIC_Jcc>, OpSize16, TB;
	def _4 : Ii32PCRel<opc4, RawFrm, (outs), (ins brtarget32:$dst), asm,
	[], IIC_Jcc>, TB, OpSize32;
	}
	}
	}

	// One ICBr instantiation per condition code. The first opcode (0x70-0x7F)
	// is used by the 8-bit form, the second (0x80-0x8F) by the 16/32-bit forms.
	defm JO : ICBr<0x70, 0x80, "jo\t$dst" , X86_COND_O>;
	defm JNO : ICBr<0x71, 0x81, "jno\t$dst", X86_COND_NO>;
	defm JB : ICBr<0x72, 0x82, "jb\t$dst" , X86_COND_B>;
	defm JAE : ICBr<0x73, 0x83, "jae\t$dst", X86_COND_AE>;
	defm JE : ICBr<0x74, 0x84, "je\t$dst" , X86_COND_E>;
	defm JNE : ICBr<0x75, 0x85, "jne\t$dst", X86_COND_NE>;
	defm JBE : ICBr<0x76, 0x86, "jbe\t$dst", X86_COND_BE>;
	defm JA : ICBr<0x77, 0x87, "ja\t$dst" , X86_COND_A>;
	defm JS : ICBr<0x78, 0x88, "js\t$dst" , X86_COND_S>;
	defm JNS : ICBr<0x79, 0x89, "jns\t$dst", X86_COND_NS>;
	defm JP : ICBr<0x7A, 0x8A, "jp\t$dst" , X86_COND_P>;
	defm JNP : ICBr<0x7B, 0x8B, "jnp\t$dst", X86_COND_NP>;
	defm JL : ICBr<0x7C, 0x8C, "jl\t$dst" , X86_COND_L>;
	defm JGE : ICBr<0x7D, 0x8D, "jge\t$dst", X86_COND_GE>;
	defm JLE : ICBr<0x7E, 0x8E, "jle\t$dst", X86_COND_LE>;
	defm JG : ICBr<0x7F, 0x8F, "jg\t$dst" , X86_COND_G>;

	// jcx/jecx/jrcx instructions.
	//
	// All three share opcode 0xE3 with an 8-bit pc-relative target and differ in
	// address-size class and in the count register listed in Uses (CX, ECX,
	// RCX). None of them has a selection pattern.
	let isBranch = 1, isTerminator = 1, hasSideEffects = 0, SchedRW = [WriteJump] in {
	// These are the 32-bit versions of this instruction for the asmparser. In
	// 32-bit mode, the address size prefix is jcxz and the unprefixed version is
	// jecxz.
	let Uses = [CX] in
	def JCXZ : Ii8PCRel<0xE3, RawFrm, (outs), (ins brtarget8:$dst),
	"jcxz\t$dst", [], IIC_JCXZ>, AdSize16,
	Requires<[Not64BitMode]>;
	let Uses = [ECX] in
	def JECXZ : Ii8PCRel<0xE3, RawFrm, (outs), (ins brtarget8:$dst),
	"jecxz\t$dst", [], IIC_JCXZ>, AdSize32;

	let Uses = [RCX] in
	def JRCXZ : Ii8PCRel<0xE3, RawFrm, (outs), (ins brtarget8:$dst),
	"jrcxz\t$dst", [], IIC_JCXZ>, AdSize64,
	Requires<[In64BitMode]>;
	}

	// Indirect branches
	//
	// The register and memory forms (JMP16/32/64 r and m) carry brind selection
	// patterns; the 16/32-bit ones require Not64BitMode and the 64-bit ones
	// In64BitMode. The far-jump forms (FARJMP*) have empty pattern lists.
	let isBranch = 1, isTerminator = 1, isBarrier = 1, isIndirectBranch = 1 in {
	def JMP16r : I<0xFF, MRM4r, (outs), (ins GR16:$dst), "jmp{w}\t{*}$dst",
	[(brind GR16:$dst)], IIC_JMP_REG>, Requires<[Not64BitMode]>,
	OpSize16, Sched<[WriteJump]>;
	def JMP16m : I<0xFF, MRM4m, (outs), (ins i16mem:$dst), "jmp{w}\t{*}$dst",
	[(brind (loadi16 addr:$dst))], IIC_JMP_MEM>,
	Requires<[Not64BitMode]>, OpSize16, Sched<[WriteJumpLd]>;

	def JMP32r : I<0xFF, MRM4r, (outs), (ins GR32:$dst), "jmp{l}\t{*}$dst",
	[(brind GR32:$dst)], IIC_JMP_REG>, Requires<[Not64BitMode]>,
	OpSize32, Sched<[WriteJump]>;
	def JMP32m : I<0xFF, MRM4m, (outs), (ins i32mem:$dst), "jmp{l}\t{*}$dst",
	[(brind (loadi32 addr:$dst))], IIC_JMP_MEM>,
	Requires<[Not64BitMode]>, OpSize32, Sched<[WriteJumpLd]>;

	def JMP64r : I<0xFF, MRM4r, (outs), (ins GR64:$dst), "jmp{q}\t{*}$dst",
	[(brind GR64:$dst)], IIC_JMP_REG>, Requires<[In64BitMode]>,
	Sched<[WriteJump]>;
	def JMP64m : I<0xFF, MRM4m, (outs), (ins i64mem:$dst), "jmp{q}\t{*}$dst",
	[(brind (loadi64 addr:$dst))], IIC_JMP_MEM>,
	Requires<[In64BitMode]>, Sched<[WriteJumpLd]>;

	// Far jumps with an immediate segment:offset exist only outside 64-bit mode.
	let Predicates = [Not64BitMode] in {
	def FARJMP16i : Iseg16<0xEA, RawFrmImm16, (outs),
	(ins i16imm:$off, i16imm:$seg),
	"ljmp{w}\t$seg, $off", [],
	IIC_JMP_FAR_PTR>, OpSize16, Sched<[WriteJump]>;
	def FARJMP32i : Iseg32<0xEA, RawFrmImm16, (outs),
	(ins i32imm:$off, i16imm:$seg),
	"ljmp{l}\t$seg, $off", [],
	IIC_JMP_FAR_PTR>, OpSize32, Sched<[WriteJump]>;
	}
	def FARJMP64 : RI<0xFF, MRM5m, (outs), (ins opaque80mem:$dst),
	"ljmp{q}\t{*}$dst", [], IIC_JMP_FAR_MEM>,
	Sched<[WriteJump]>;

	def FARJMP16m : I<0xFF, MRM5m, (outs), (ins opaque32mem:$dst),
	"ljmp{w}\t{*}$dst", [], IIC_JMP_FAR_MEM>, OpSize16,
	Sched<[WriteJumpLd]>;
	def FARJMP32m : I<0xFF, MRM5m, (outs), (ins opaque48mem:$dst),
	"ljmp{l}\t{*}$dst", [], IIC_JMP_FAR_MEM>, OpSize32,
	Sched<[WriteJumpLd]>;
	}


	// Loop instructions
	//
	// Each takes an 8-bit pc-relative target and has no selection pattern.
	let SchedRW = [WriteJump] in {
	def LOOP : Ii8PCRel<0xE2, RawFrm, (outs), (ins brtarget8:$dst), "loop\t$dst", [], IIC_LOOP>;
	def LOOPE : Ii8PCRel<0xE1, RawFrm, (outs), (ins brtarget8:$dst), "loope\t$dst", [], IIC_LOOPE>;
	def LOOPNE : Ii8PCRel<0xE0, RawFrm, (outs), (ins brtarget8:$dst), "loopne\t$dst", [], IIC_LOOPNE>;
	}

	//===----------------------------------------------------------------------===//
	// Call Instructions...
	//
	// Call instructions that list ESP in Uses. The register and memory forms
	// (CALL16/32 r and m) match X86call; the pc-relative and far-call forms
	// have no pattern.
	let isCall = 1 in
	// All calls clobber the non-callee saved registers. ESP is marked as
	// a use to prevent stack-pointer assignments that appear immediately
	// before calls from potentially appearing dead. Uses for argument
	// registers are added manually.
	let Uses = [ESP] in {
	def CALLpcrel32 : Ii32PCRel<0xE8, RawFrm,
	(outs), (ins i32imm_pcrel:$dst),
	"call{l}\t$dst", [], IIC_CALL_RI>, OpSize32,
	Requires<[Not64BitMode]>, Sched<[WriteJump]>;
	let hasSideEffects = 0 in
	def CALLpcrel16 : Ii16PCRel<0xE8, RawFrm,
	(outs), (ins i16imm_pcrel:$dst),
	"call{w}\t$dst", [], IIC_CALL_RI>, OpSize16,
	Sched<[WriteJump]>;
	def CALL16r : I<0xFF, MRM2r, (outs), (ins GR16:$dst),
	"call{w}\t{*}$dst", [(X86call GR16:$dst)], IIC_CALL_RI>,
	OpSize16, Requires<[Not64BitMode]>, Sched<[WriteJump]>;
	// The memory-operand forms additionally require FavorMemIndirectCall.
	def CALL16m : I<0xFF, MRM2m, (outs), (ins i16mem:$dst),
	"call{w}\t{*}$dst", [(X86call (loadi16 addr:$dst))],
	IIC_CALL_MEM>, OpSize16,
	Requires<[Not64BitMode,FavorMemIndirectCall]>,
	Sched<[WriteJumpLd]>;
	def CALL32r : I<0xFF, MRM2r, (outs), (ins GR32:$dst),
	"call{l}\t{*}$dst", [(X86call GR32:$dst)], IIC_CALL_RI>,
	OpSize32, Requires<[Not64BitMode]>, Sched<[WriteJump]>;
	def CALL32m : I<0xFF, MRM2m, (outs), (ins i32mem:$dst),
	"call{l}\t{*}$dst", [(X86call (loadi32 addr:$dst))],
	IIC_CALL_MEM>, OpSize32,
	Requires<[Not64BitMode,FavorMemIndirectCall]>,
	Sched<[WriteJumpLd]>;

	// Far calls with an immediate segment:offset exist only outside 64-bit mode.
	let Predicates = [Not64BitMode] in {
	def FARCALL16i : Iseg16<0x9A, RawFrmImm16, (outs),
	(ins i16imm:$off, i16imm:$seg),
	"lcall{w}\t$seg, $off", [],
	IIC_CALL_FAR_PTR>, OpSize16, Sched<[WriteJump]>;
	def FARCALL32i : Iseg32<0x9A, RawFrmImm16, (outs),
	(ins i32imm:$off, i16imm:$seg),
	"lcall{l}\t$seg, $off", [],
	IIC_CALL_FAR_PTR>, OpSize32, Sched<[WriteJump]>;
	}

	def FARCALL16m : I<0xFF, MRM3m, (outs), (ins opaque32mem:$dst),
	"lcall{w}\t{*}$dst", [], IIC_CALL_FAR_MEM>, OpSize16,
	Sched<[WriteJumpLd]>;
	def FARCALL32m : I<0xFF, MRM3m, (outs), (ins opaque48mem:$dst),
	"lcall{l}\t{*}$dst", [], IIC_CALL_FAR_MEM>, OpSize32,
	Sched<[WriteJumpLd]>;
	}


	// Tail call stuff.
	//
	// Everything in this block is marked as a call, terminator, return and
	// barrier, is codegen-only and lists ESP in Uses. The TCRETURN* records are
	// pseudos taking a target plus an i32imm:$offset; the TAILJMP* records are
	// jump encodings taking only the target.
	let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1,
	isCodeGenOnly = 1, SchedRW = [WriteJumpLd] in
	let Uses = [ESP] in {
	def TCRETURNdi : PseudoI<(outs),
	(ins i32imm_pcrel:$dst, i32imm:$offset), []>;
	def TCRETURNri : PseudoI<(outs),
	(ins ptr_rc_tailcall:$dst, i32imm:$offset), []>;
	let mayLoad = 1 in
	def TCRETURNmi : PseudoI<(outs),
	(ins i32mem_TC:$dst, i32imm:$offset), []>;

	// FIXME: These should be pseudo instructions that are lowered when going to
	// mcinst.
	def TAILJMPd : Ii32PCRel<0xE9, RawFrm, (outs),
	(ins i32imm_pcrel:$dst),
	"jmp\t$dst",
	[], IIC_JMP_REL>;

	def TAILJMPr : I<0xFF, MRM4r, (outs), (ins ptr_rc_tailcall:$dst),
	"", [], IIC_JMP_REG>; // FIXME: Remove encoding when JIT is dead.
	let mayLoad = 1 in
	def TAILJMPm : I<0xFF, MRM4m, (outs), (ins i32mem_TC:$dst),
	"jmp{l}\t{*}$dst", [], IIC_JMP_MEM>;
	}

	-// Conditional tail calls are similar to the above, but they are branches
	-// rather than barriers, and they use EFLAGS.
	-let isCall = 1, isTerminator = 1, isReturn = 1, isBranch = 1,
	- isCodeGenOnly = 1, SchedRW = [WriteJumpLd] in
	- let Uses = [ESP, EFLAGS] in {
	- def TCRETURNdicc : PseudoI<(outs),
	- (ins i32imm_pcrel:$dst, i32imm:$offset, i32imm:$cond), []>;

	- // This gets substituted to a conditional jump instruction in MC lowering.
	- def TAILJMPd_CC : Ii32PCRel<0x80, RawFrm, (outs),
	- (ins i32imm_pcrel:$dst, i32imm:$cond),
	- "",
	- [], IIC_JMP_REL>;
	-}
	-
	-
	//===----------------------------------------------------------------------===//
	// Call Instructions...
	//

	// RSP is marked as a use to prevent stack-pointer assignments that appear
	// immediately before calls from potentially appearing dead. Uses for argument
	// registers are added manually.
	//
	// CALL64r and CALL64m match X86call; CALL64pcrel32 and FARCALL64 have no
	// selection pattern.
	let isCall = 1, Uses = [RSP], SchedRW = [WriteJump] in {
	// NOTE: this pattern doesn't match "X86call imm", because we do not know
	// that the offset between an arbitrary immediate and the call will fit in
	// the 32-bit pcrel field that we have.
	def CALL64pcrel32 : Ii32PCRel<0xE8, RawFrm,
	(outs), (ins i64i32imm_pcrel:$dst),
	"call{q}\t$dst", [], IIC_CALL_RI>, OpSize32,
	Requires<[In64BitMode]>;
	def CALL64r : I<0xFF, MRM2r, (outs), (ins GR64:$dst),
	"call{q}\t{*}$dst", [(X86call GR64:$dst)],
	IIC_CALL_RI>,
	Requires<[In64BitMode]>;
	// The memory-operand form additionally requires FavorMemIndirectCall.
	def CALL64m : I<0xFF, MRM2m, (outs), (ins i64mem:$dst),
	"call{q}\t{*}$dst", [(X86call (loadi64 addr:$dst))],
	IIC_CALL_MEM>,
	Requires<[In64BitMode,FavorMemIndirectCall]>;

	def FARCALL64 : RI<0xFF, MRM3m, (outs), (ins opaque80mem:$dst),
	"lcall{q}\t{*}$dst", [], IIC_CALL_FAR_MEM>;
	}

	// 64-bit tail calls. Everything in this block is marked as a call,
	// terminator, return and barrier, is codegen-only, lists RSP in Uses and
	// sets usesCustomInserter. The TCRETURN*64 records are pseudos taking a
	// target plus an i32imm:$offset; the TAILJMP*64 records are jump encodings
	// taking only the target.
	let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1,
	isCodeGenOnly = 1, Uses = [RSP], usesCustomInserter = 1,
	SchedRW = [WriteJump] in {
	def TCRETURNdi64 : PseudoI<(outs),
	(ins i64i32imm_pcrel:$dst, i32imm:$offset),
	[]>;
	def TCRETURNri64 : PseudoI<(outs),
	(ins ptr_rc_tailcall:$dst, i32imm:$offset), []>;
	let mayLoad = 1 in
	def TCRETURNmi64 : PseudoI<(outs),
	(ins i64mem_TC:$dst, i32imm:$offset), []>;

	def TAILJMPd64 : Ii32PCRel<0xE9, RawFrm, (outs), (ins i64i32imm_pcrel:$dst),
	"jmp\t$dst", [], IIC_JMP_REL>;

	def TAILJMPr64 : I<0xFF, MRM4r, (outs), (ins ptr_rc_tailcall:$dst),
	"jmp{q}\t{*}$dst", [], IIC_JMP_MEM>;

	let mayLoad = 1 in
	def TAILJMPm64 : I<0xFF, MRM4m, (outs), (ins i64mem_TC:$dst),
	"jmp{q}\t{*}$dst", [], IIC_JMP_MEM>;

	// Win64 wants indirect jumps leaving the function to have a REX_W prefix.
	let hasREX_WPrefix = 1 in {
	def TAILJMPr64_REX : I<0xFF, MRM4r, (outs), (ins ptr_rc_tailcall:$dst),
	"rex64 jmp{q}\t{*}$dst", [], IIC_JMP_MEM>;

	let mayLoad = 1 in
	def TAILJMPm64_REX : I<0xFF, MRM4m, (outs), (ins i64mem_TC:$dst),
	"rex64 jmp{q}\t{*}$dst", [], IIC_JMP_MEM>;
	}
	-}
	-
	-// Conditional tail calls are similar to the above, but they are branches
	-// rather than barriers, and they use EFLAGS.
	-let isCall = 1, isTerminator = 1, isReturn = 1, isBranch = 1,
	- isCodeGenOnly = 1, SchedRW = [WriteJumpLd] in
	- let Uses = [RSP, EFLAGS] in {
	- def TCRETURNdi64cc : PseudoI<(outs),
	- (ins i64i32imm_pcrel:$dst, i32imm:$offset,
	- i32imm:$cond), []>;
	-
	- // This gets substituted to a conditional jump instruction in MC lowering.
	- def TAILJMPd64_CC : Ii32PCRel<0x80, RawFrm, (outs),
	- (ins i64i32imm_pcrel:$dst, i32imm:$cond),
	- "",
	- [], IIC_JMP_REL>;
	}
	Index: projects/clang400-import/contrib/llvm/lib/Target/X86/X86InstrInfo.cpp
	===================================================================
	--- projects/clang400-import/contrib/llvm/lib/Target/X86/X86InstrInfo.cpp (revision 313642)
	+++ projects/clang400-import/contrib/llvm/lib/Target/X86/X86InstrInfo.cpp (revision 313643)
	@@ -1,9746 +1,9667 @@
	//===-- X86InstrInfo.cpp - X86 Instruction Information --------------------===//
	//
	// The LLVM Compiler Infrastructure
	//
	// This file is distributed under the University of Illinois Open Source
	// License. See LICENSE.TXT for details.
	//
	//===----------------------------------------------------------------------===//
	//
	// This file contains the X86 implementation of the TargetInstrInfo class.
	//
	//===----------------------------------------------------------------------===//

	#include "X86InstrInfo.h"
	#include "X86.h"
	#include "X86InstrBuilder.h"
	#include "X86MachineFunctionInfo.h"
	#include "X86Subtarget.h"
	#include "X86TargetMachine.h"
	#include "llvm/ADT/STLExtras.h"
	#include "llvm/CodeGen/LivePhysRegs.h"
	#include "llvm/CodeGen/LiveVariables.h"
	#include "llvm/CodeGen/MachineConstantPool.h"
	#include "llvm/CodeGen/MachineDominators.h"
	#include "llvm/CodeGen/MachineFrameInfo.h"
	#include "llvm/CodeGen/MachineInstrBuilder.h"
	#include "llvm/CodeGen/MachineModuleInfo.h"
	#include "llvm/CodeGen/MachineRegisterInfo.h"
	#include "llvm/CodeGen/StackMaps.h"
	#include "llvm/IR/DerivedTypes.h"
	#include "llvm/IR/Function.h"
	#include "llvm/IR/LLVMContext.h"
	#include "llvm/MC/MCAsmInfo.h"
	#include "llvm/MC/MCExpr.h"
	#include "llvm/MC/MCInst.h"
	#include "llvm/Support/CommandLine.h"
	#include "llvm/Support/Debug.h"
	#include "llvm/Support/ErrorHandling.h"
	#include "llvm/Support/raw_ostream.h"
	#include "llvm/Target/TargetOptions.h"

	using namespace llvm;

	#define DEBUG_TYPE "x86-instr-info"

	#define GET_INSTRINFO_CTOR_DTOR
	#include "X86GenInstrInfo.inc"

	// Command-line options local to this file. Each is a static cl::opt whose
	// first argument is the option name and whose cl::desc is its help text.

	// "disable-spill-fusing": boolean switch; per its description it disables
	// fusing of spill code into instructions. No explicit initial value is given.
	static cl::opt<bool>
	NoFusing("disable-spill-fusing",
	cl::desc("Disable fusing of spill code into instructions"));
	// "print-failed-fuse-candidates": boolean diagnostic switch, marked
	// cl::Hidden; per its description it prints instructions the allocator
	// wanted to fuse but the X86 backend could not.
	static cl::opt<bool>
	PrintFailedFusing("print-failed-fuse-candidates",
	cl::desc("Print instructions that the allocator wants to"
	" fuse, but the X86 backend currently can't"),
	cl::Hidden);
	// "remat-pic-stub-load": boolean switch, marked cl::Hidden, initialised to
	// false; per its description it controls re-materialising a load from a stub
	// in PIC mode.
	static cl::opt<bool>
	ReMatPICStubLoad("remat-pic-stub-load",
	cl::desc("Re-materialize load from stub in PIC mode"),
	cl::init(false), cl::Hidden);
	// "partial-reg-update-clearance": unsigned tunable, marked cl::Hidden,
	// initialised to 64; described as the clearance between two register writes
	// used when deciding to insert an XOR to avoid a partial register update.
	static cl::opt<unsigned>
	PartialRegUpdateClearance("partial-reg-update-clearance",
	cl::desc("Clearance between two register writes "
	"for inserting XOR to avoid partial "
	"register update"),
	cl::init(64), cl::Hidden);
	// "undef-reg-clearance": unsigned tunable, marked cl::Hidden, initialised to
	// 128; described as the number of idle instructions wanted before certain
	// undef register reads.
	static cl::opt<unsigned>
	UndefRegClearance("undef-reg-clearance",
	cl::desc("How many idle instructions we would like before "
	"certain undef register reads"),
	cl::init(128), cl::Hidden);

	// Bit layout of the Flags word carried by each memory-fold table entry.
	enum {
	  // Bits 0-3: selects which memory operand is being unfolded.
	  TB_INDEX_0 = 0,
	  TB_INDEX_1 = 1,
	  TB_INDEX_2 = 2,
	  TB_INDEX_3 = 3,
	  TB_INDEX_4 = 4,
	  TB_INDEX_MASK = 0xf,

	  // Bit 4: leave the reverse mapping (MemOp -> RegOp) out of the table.
	  // This may be needed because the mapping can be many -> one.
	  TB_NO_REVERSE = 0x10,

	  // Bit 5: leave the forward mapping (RegOp -> MemOp) out of the table.
	  // Native Client needs this because it prohibits branch instructions
	  // from using a memory operand.
	  TB_NO_FORWARD = 0x20,

	  // Bits 6 and 7: going by the names, whether the folded memory access is
	  // a load and/or a store.
	  TB_FOLDED_LOAD = 0x40,
	  TB_FOLDED_STORE = 0x80,

	  // Bits 8-15: minimum alignment required for the load/store, used for
	  // RegOp -> MemOp conversion. The alignment value is stored shifted left
	  // by TB_ALIGN_SHIFT; TB_ALIGN_NONE means no requirement.
	  TB_ALIGN_SHIFT = 8,
	  TB_ALIGN_NONE = 0,
	  TB_ALIGN_16 = 16 << TB_ALIGN_SHIFT,
	  TB_ALIGN_32 = 32 << TB_ALIGN_SHIFT,
	  TB_ALIGN_64 = 64 << TB_ALIGN_SHIFT,
	  TB_ALIGN_MASK = 0xff00
	};

	// One row of a static memory-folding table. The tables in this file
	// brace-initialise entries as { RegOp, MemOp, Flags }, so the field order
	// here must not change.
	struct X86MemoryFoldTableEntry {
	uint16_t RegOp; // Opcode of the register-operand form (e.g. X86::ADD32rr).
	uint16_t MemOp; // Opcode of the matching memory-operand form (e.g. X86::ADD32mr).
	uint16_t Flags; // Bitwise OR of TB_* values (0 when no flags apply).
	};

	// Pin the vtable to this file.
	// The body is intentionally empty: the function exists only so that it has
	// an out-of-line definition in this translation unit (presumably it is
	// declared virtual in X86InstrInfo.h -- confirm against the header).
	void X86InstrInfo::anchor() {}

	X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
	: X86GenInstrInfo((STI.isTarget64BitLP64() ? X86::ADJCALLSTACKDOWN64
	: X86::ADJCALLSTACKDOWN32),
	(STI.isTarget64BitLP64() ? X86::ADJCALLSTACKUP64
	: X86::ADJCALLSTACKUP32),
	X86::CATCHRET,
	(STI.is64Bit() ? X86::RETQ : X86::RETL)),
	Subtarget(STI), RI(STI.getTargetTriple()) {

	static const X86MemoryFoldTableEntry MemoryFoldTable2Addr[] = {
	{ X86::ADC32ri, X86::ADC32mi, 0 },
	{ X86::ADC32ri8, X86::ADC32mi8, 0 },
	{ X86::ADC32rr, X86::ADC32mr, 0 },
	{ X86::ADC64ri32, X86::ADC64mi32, 0 },
	{ X86::ADC64ri8, X86::ADC64mi8, 0 },
	{ X86::ADC64rr, X86::ADC64mr, 0 },
	{ X86::ADD16ri, X86::ADD16mi, 0 },
	{ X86::ADD16ri8, X86::ADD16mi8, 0 },
	{ X86::ADD16ri_DB, X86::ADD16mi, TB_NO_REVERSE },
	{ X86::ADD16ri8_DB, X86::ADD16mi8, TB_NO_REVERSE },
	{ X86::ADD16rr, X86::ADD16mr, 0 },
	{ X86::ADD16rr_DB, X86::ADD16mr, TB_NO_REVERSE },
	{ X86::ADD32ri, X86::ADD32mi, 0 },
	{ X86::ADD32ri8, X86::ADD32mi8, 0 },
	{ X86::ADD32ri_DB, X86::ADD32mi, TB_NO_REVERSE },
	{ X86::ADD32ri8_DB, X86::ADD32mi8, TB_NO_REVERSE },
	{ X86::ADD32rr, X86::ADD32mr, 0 },
	{ X86::ADD32rr_DB, X86::ADD32mr, TB_NO_REVERSE },
	{ X86::ADD64ri32, X86::ADD64mi32, 0 },
	{ X86::ADD64ri8, X86::ADD64mi8, 0 },
	{ X86::ADD64ri32_DB,X86::ADD64mi32, TB_NO_REVERSE },
	{ X86::ADD64ri8_DB, X86::ADD64mi8, TB_NO_REVERSE },
	{ X86::ADD64rr, X86::ADD64mr, 0 },
	{ X86::ADD64rr_DB, X86::ADD64mr, TB_NO_REVERSE },
	{ X86::ADD8ri, X86::ADD8mi, 0 },
	{ X86::ADD8rr, X86::ADD8mr, 0 },
	{ X86::AND16ri, X86::AND16mi, 0 },
	{ X86::AND16ri8, X86::AND16mi8, 0 },
	{ X86::AND16rr, X86::AND16mr, 0 },
	{ X86::AND32ri, X86::AND32mi, 0 },
	{ X86::AND32ri8, X86::AND32mi8, 0 },
	{ X86::AND32rr, X86::AND32mr, 0 },
	{ X86::AND64ri32, X86::AND64mi32, 0 },
	{ X86::AND64ri8, X86::AND64mi8, 0 },
	{ X86::AND64rr, X86::AND64mr, 0 },
	{ X86::AND8ri, X86::AND8mi, 0 },
	{ X86::AND8rr, X86::AND8mr, 0 },
	{ X86::DEC16r, X86::DEC16m, 0 },
	{ X86::DEC32r, X86::DEC32m, 0 },
	{ X86::DEC64r, X86::DEC64m, 0 },
	{ X86::DEC8r, X86::DEC8m, 0 },
	{ X86::INC16r, X86::INC16m, 0 },
	{ X86::INC32r, X86::INC32m, 0 },
	{ X86::INC64r, X86::INC64m, 0 },
	{ X86::INC8r, X86::INC8m, 0 },
	{ X86::NEG16r, X86::NEG16m, 0 },
	{ X86::NEG32r, X86::NEG32m, 0 },
	{ X86::NEG64r, X86::NEG64m, 0 },
	{ X86::NEG8r, X86::NEG8m, 0 },
	{ X86::NOT16r, X86::NOT16m, 0 },
	{ X86::NOT32r, X86::NOT32m, 0 },
	{ X86::NOT64r, X86::NOT64m, 0 },
	{ X86::NOT8r, X86::NOT8m, 0 },
	{ X86::OR16ri, X86::OR16mi, 0 },
	{ X86::OR16ri8, X86::OR16mi8, 0 },
	{ X86::OR16rr, X86::OR16mr, 0 },
	{ X86::OR32ri, X86::OR32mi, 0 },
	{ X86::OR32ri8, X86::OR32mi8, 0 },
	{ X86::OR32rr, X86::OR32mr, 0 },
	{ X86::OR64ri32, X86::OR64mi32, 0 },
	{ X86::OR64ri8, X86::OR64mi8, 0 },
	{ X86::OR64rr, X86::OR64mr, 0 },
	{ X86::OR8ri, X86::OR8mi, 0 },
	{ X86::OR8rr, X86::OR8mr, 0 },
	{ X86::ROL16r1, X86::ROL16m1, 0 },
	{ X86::ROL16rCL, X86::ROL16mCL, 0 },
	{ X86::ROL16ri, X86::ROL16mi, 0 },
	{ X86::ROL32r1, X86::ROL32m1, 0 },
	{ X86::ROL32rCL, X86::ROL32mCL, 0 },
	{ X86::ROL32ri, X86::ROL32mi, 0 },
	{ X86::ROL64r1, X86::ROL64m1, 0 },
	{ X86::ROL64rCL, X86::ROL64mCL, 0 },
	{ X86::ROL64ri, X86::ROL64mi, 0 },
	{ X86::ROL8r1, X86::ROL8m1, 0 },
	{ X86::ROL8rCL, X86::ROL8mCL, 0 },
	{ X86::ROL8ri, X86::ROL8mi, 0 },
	{ X86::ROR16r1, X86::ROR16m1, 0 },
	{ X86::ROR16rCL, X86::ROR16mCL, 0 },
	{ X86::ROR16ri, X86::ROR16mi, 0 },
	{ X86::ROR32r1, X86::ROR32m1, 0 },
	{ X86::ROR32rCL, X86::ROR32mCL, 0 },
	{ X86::ROR32ri, X86::ROR32mi, 0 },
	{ X86::ROR64r1, X86::ROR64m1, 0 },
	{ X86::ROR64rCL, X86::ROR64mCL, 0 },
	{ X86::ROR64ri, X86::ROR64mi, 0 },
	{ X86::ROR8r1, X86::ROR8m1, 0 },
	{ X86::ROR8rCL, X86::ROR8mCL, 0 },
	{ X86::ROR8ri, X86::ROR8mi, 0 },
	{ X86::SAR16r1, X86::SAR16m1, 0 },
	{ X86::SAR16rCL, X86::SAR16mCL, 0 },
	{ X86::SAR16ri, X86::SAR16mi, 0 },
	{ X86::SAR32r1, X86::SAR32m1, 0 },
	{ X86::SAR32rCL, X86::SAR32mCL, 0 },
	{ X86::SAR32ri, X86::SAR32mi, 0 },
	{ X86::SAR64r1, X86::SAR64m1, 0 },
	{ X86::SAR64rCL, X86::SAR64mCL, 0 },
	{ X86::SAR64ri, X86::SAR64mi, 0 },
	{ X86::SAR8r1, X86::SAR8m1, 0 },
	{ X86::SAR8rCL, X86::SAR8mCL, 0 },
	{ X86::SAR8ri, X86::SAR8mi, 0 },
	{ X86::SBB32ri, X86::SBB32mi, 0 },
	{ X86::SBB32ri8, X86::SBB32mi8, 0 },
	{ X86::SBB32rr, X86::SBB32mr, 0 },
	{ X86::SBB64ri32, X86::SBB64mi32, 0 },
	{ X86::SBB64ri8, X86::SBB64mi8, 0 },
	{ X86::SBB64rr, X86::SBB64mr, 0 },
	{ X86::SHL16r1, X86::SHL16m1, 0 },
	{ X86::SHL16rCL, X86::SHL16mCL, 0 },
	{ X86::SHL16ri, X86::SHL16mi, 0 },
	{ X86::SHL32r1, X86::SHL32m1, 0 },
	{ X86::SHL32rCL, X86::SHL32mCL, 0 },
	{ X86::SHL32ri, X86::SHL32mi, 0 },
	{ X86::SHL64r1, X86::SHL64m1, 0 },
	{ X86::SHL64rCL, X86::SHL64mCL, 0 },
	{ X86::SHL64ri, X86::SHL64mi, 0 },
	{ X86::SHL8r1, X86::SHL8m1, 0 },
	{ X86::SHL8rCL, X86::SHL8mCL, 0 },
	{ X86::SHL8ri, X86::SHL8mi, 0 },
	{ X86::SHLD16rrCL, X86::SHLD16mrCL, 0 },
	{ X86::SHLD16rri8, X86::SHLD16mri8, 0 },
	{ X86::SHLD32rrCL, X86::SHLD32mrCL, 0 },
	{ X86::SHLD32rri8, X86::SHLD32mri8, 0 },
	{ X86::SHLD64rrCL, X86::SHLD64mrCL, 0 },
	{ X86::SHLD64rri8, X86::SHLD64mri8, 0 },
	{ X86::SHR16r1, X86::SHR16m1, 0 },
	{ X86::SHR16rCL, X86::SHR16mCL, 0 },
	{ X86::SHR16ri, X86::SHR16mi, 0 },
	{ X86::SHR32r1, X86::SHR32m1, 0 },
	{ X86::SHR32rCL, X86::SHR32mCL, 0 },
	{ X86::SHR32ri, X86::SHR32mi, 0 },
	{ X86::SHR64r1, X86::SHR64m1, 0 },
	{ X86::SHR64rCL, X86::SHR64mCL, 0 },
	{ X86::SHR64ri, X86::SHR64mi, 0 },
	{ X86::SHR8r1, X86::SHR8m1, 0 },
	{ X86::SHR8rCL, X86::SHR8mCL, 0 },
	{ X86::SHR8ri, X86::SHR8mi, 0 },
	{ X86::SHRD16rrCL, X86::SHRD16mrCL, 0 },
	{ X86::SHRD16rri8, X86::SHRD16mri8, 0 },
	{ X86::SHRD32rrCL, X86::SHRD32mrCL, 0 },
	{ X86::SHRD32rri8, X86::SHRD32mri8, 0 },
	{ X86::SHRD64rrCL, X86::SHRD64mrCL, 0 },
	{ X86::SHRD64rri8, X86::SHRD64mri8, 0 },
	{ X86::SUB16ri, X86::SUB16mi, 0 },
	{ X86::SUB16ri8, X86::SUB16mi8, 0 },
	{ X86::SUB16rr, X86::SUB16mr, 0 },
	{ X86::SUB32ri, X86::SUB32mi, 0 },
	{ X86::SUB32ri8, X86::SUB32mi8, 0 },
	{ X86::SUB32rr, X86::SUB32mr, 0 },
	{ X86::SUB64ri32, X86::SUB64mi32, 0 },
	{ X86::SUB64ri8, X86::SUB64mi8, 0 },
	{ X86::SUB64rr, X86::SUB64mr, 0 },
	{ X86::SUB8ri, X86::SUB8mi, 0 },
	{ X86::SUB8rr, X86::SUB8mr, 0 },
	{ X86::XOR16ri, X86::XOR16mi, 0 },
	{ X86::XOR16ri8, X86::XOR16mi8, 0 },
	{ X86::XOR16rr, X86::XOR16mr, 0 },
	{ X86::XOR32ri, X86::XOR32mi, 0 },
	{ X86::XOR32ri8, X86::XOR32mi8, 0 },
	{ X86::XOR32rr, X86::XOR32mr, 0 },
	{ X86::XOR64ri32, X86::XOR64mi32, 0 },
	{ X86::XOR64ri8, X86::XOR64mi8, 0 },
	{ X86::XOR64rr, X86::XOR64mr, 0 },
	{ X86::XOR8ri, X86::XOR8mi, 0 },
	{ X86::XOR8rr, X86::XOR8mr, 0 }
	};

	for (X86MemoryFoldTableEntry Entry : MemoryFoldTable2Addr) {
	AddTableEntry(RegOp2MemOpTable2Addr, MemOp2RegOpTable,
	Entry.RegOp, Entry.MemOp,
	// Index 0, folded load and store, no alignment requirement.
	Entry.Flags \| TB_INDEX_0 \| TB_FOLDED_LOAD \| TB_FOLDED_STORE);
	}

	static const X86MemoryFoldTableEntry MemoryFoldTable0[] = {
	{ X86::BT16ri8, X86::BT16mi8, TB_FOLDED_LOAD },
	{ X86::BT32ri8, X86::BT32mi8, TB_FOLDED_LOAD },
	{ X86::BT64ri8, X86::BT64mi8, TB_FOLDED_LOAD },
	{ X86::CALL32r, X86::CALL32m, TB_FOLDED_LOAD },
	{ X86::CALL64r, X86::CALL64m, TB_FOLDED_LOAD },
	{ X86::CMP16ri, X86::CMP16mi, TB_FOLDED_LOAD },
	{ X86::CMP16ri8, X86::CMP16mi8, TB_FOLDED_LOAD },
	{ X86::CMP16rr, X86::CMP16mr, TB_FOLDED_LOAD },
	{ X86::CMP32ri, X86::CMP32mi, TB_FOLDED_LOAD },
	{ X86::CMP32ri8, X86::CMP32mi8, TB_FOLDED_LOAD },
	{ X86::CMP32rr, X86::CMP32mr, TB_FOLDED_LOAD },
	{ X86::CMP64ri32, X86::CMP64mi32, TB_FOLDED_LOAD },
	{ X86::CMP64ri8, X86::CMP64mi8, TB_FOLDED_LOAD },
	{ X86::CMP64rr, X86::CMP64mr, TB_FOLDED_LOAD },
	{ X86::CMP8ri, X86::CMP8mi, TB_FOLDED_LOAD },
	{ X86::CMP8rr, X86::CMP8mr, TB_FOLDED_LOAD },
	{ X86::DIV16r, X86::DIV16m, TB_FOLDED_LOAD },
	{ X86::DIV32r, X86::DIV32m, TB_FOLDED_LOAD },
	{ X86::DIV64r, X86::DIV64m, TB_FOLDED_LOAD },
	{ X86::DIV8r, X86::DIV8m, TB_FOLDED_LOAD },
	{ X86::EXTRACTPSrr, X86::EXTRACTPSmr, TB_FOLDED_STORE },
	{ X86::IDIV16r, X86::IDIV16m, TB_FOLDED_LOAD },
	{ X86::IDIV32r, X86::IDIV32m, TB_FOLDED_LOAD },
	{ X86::IDIV64r, X86::IDIV64m, TB_FOLDED_LOAD },
	{ X86::IDIV8r, X86::IDIV8m, TB_FOLDED_LOAD },
	{ X86::IMUL16r, X86::IMUL16m, TB_FOLDED_LOAD },
	{ X86::IMUL32r, X86::IMUL32m, TB_FOLDED_LOAD },
	{ X86::IMUL64r, X86::IMUL64m, TB_FOLDED_LOAD },
	{ X86::IMUL8r, X86::IMUL8m, TB_FOLDED_LOAD },
	{ X86::JMP32r, X86::JMP32m, TB_FOLDED_LOAD },
	{ X86::JMP64r, X86::JMP64m, TB_FOLDED_LOAD },
	{ X86::MOV16ri, X86::MOV16mi, TB_FOLDED_STORE },
	{ X86::MOV16rr, X86::MOV16mr, TB_FOLDED_STORE },
	{ X86::MOV32ri, X86::MOV32mi, TB_FOLDED_STORE },
	{ X86::MOV32rr, X86::MOV32mr, TB_FOLDED_STORE },
	{ X86::MOV64ri32, X86::MOV64mi32, TB_FOLDED_STORE },
	{ X86::MOV64rr, X86::MOV64mr, TB_FOLDED_STORE },
	{ X86::MOV8ri, X86::MOV8mi, TB_FOLDED_STORE },
	{ X86::MOV8rr, X86::MOV8mr, TB_FOLDED_STORE },
	{ X86::MOV8rr_NOREX, X86::MOV8mr_NOREX, TB_FOLDED_STORE },
	{ X86::MOVAPDrr, X86::MOVAPDmr, TB_FOLDED_STORE \| TB_ALIGN_16 },
	{ X86::MOVAPSrr, X86::MOVAPSmr, TB_FOLDED_STORE \| TB_ALIGN_16 },
	{ X86::MOVDQArr, X86::MOVDQAmr, TB_FOLDED_STORE \| TB_ALIGN_16 },
	{ X86::MOVDQUrr, X86::MOVDQUmr, TB_FOLDED_STORE },
	{ X86::MOVPDI2DIrr, X86::MOVPDI2DImr, TB_FOLDED_STORE },
	{ X86::MOVPQIto64rr,X86::MOVPQI2QImr, TB_FOLDED_STORE },
	{ X86::MOVSDto64rr, X86::MOVSDto64mr, TB_FOLDED_STORE },
	{ X86::MOVSS2DIrr, X86::MOVSS2DImr, TB_FOLDED_STORE },
	{ X86::MOVUPDrr, X86::MOVUPDmr, TB_FOLDED_STORE },
	{ X86::MOVUPSrr, X86::MOVUPSmr, TB_FOLDED_STORE },
	{ X86::MUL16r, X86::MUL16m, TB_FOLDED_LOAD },
	{ X86::MUL32r, X86::MUL32m, TB_FOLDED_LOAD },
	{ X86::MUL64r, X86::MUL64m, TB_FOLDED_LOAD },
	{ X86::MUL8r, X86::MUL8m, TB_FOLDED_LOAD },
	{ X86::PEXTRDrr, X86::PEXTRDmr, TB_FOLDED_STORE },
	{ X86::PEXTRQrr, X86::PEXTRQmr, TB_FOLDED_STORE },
	{ X86::PUSH16r, X86::PUSH16rmm, TB_FOLDED_LOAD },
	{ X86::PUSH32r, X86::PUSH32rmm, TB_FOLDED_LOAD },
	{ X86::PUSH64r, X86::PUSH64rmm, TB_FOLDED_LOAD },
	{ X86::SETAEr, X86::SETAEm, TB_FOLDED_STORE },
	{ X86::SETAr, X86::SETAm, TB_FOLDED_STORE },
	{ X86::SETBEr, X86::SETBEm, TB_FOLDED_STORE },
	{ X86::SETBr, X86::SETBm, TB_FOLDED_STORE },
	{ X86::SETEr, X86::SETEm, TB_FOLDED_STORE },
	{ X86::SETGEr, X86::SETGEm, TB_FOLDED_STORE },
	{ X86::SETGr, X86::SETGm, TB_FOLDED_STORE },
	{ X86::SETLEr, X86::SETLEm, TB_FOLDED_STORE },
	{ X86::SETLr, X86::SETLm, TB_FOLDED_STORE },
	{ X86::SETNEr, X86::SETNEm, TB_FOLDED_STORE },
	{ X86::SETNOr, X86::SETNOm, TB_FOLDED_STORE },
	{ X86::SETNPr, X86::SETNPm, TB_FOLDED_STORE },
	{ X86::SETNSr, X86::SETNSm, TB_FOLDED_STORE },
	{ X86::SETOr, X86::SETOm, TB_FOLDED_STORE },
	{ X86::SETPr, X86::SETPm, TB_FOLDED_STORE },
	{ X86::SETSr, X86::SETSm, TB_FOLDED_STORE },
	{ X86::TAILJMPr, X86::TAILJMPm, TB_FOLDED_LOAD },
	{ X86::TAILJMPr64, X86::TAILJMPm64, TB_FOLDED_LOAD },
	{ X86::TAILJMPr64_REX, X86::TAILJMPm64_REX, TB_FOLDED_LOAD },
	{ X86::TEST16ri, X86::TEST16mi, TB_FOLDED_LOAD },
	{ X86::TEST32ri, X86::TEST32mi, TB_FOLDED_LOAD },
	{ X86::TEST64ri32, X86::TEST64mi32, TB_FOLDED_LOAD },
	{ X86::TEST8ri, X86::TEST8mi, TB_FOLDED_LOAD },

	// AVX 128-bit versions of foldable instructions
	{ X86::VEXTRACTPSrr,X86::VEXTRACTPSmr, TB_FOLDED_STORE },
	{ X86::VEXTRACTF128rr, X86::VEXTRACTF128mr, TB_FOLDED_STORE \| TB_ALIGN_16 },
	{ X86::VMOVAPDrr, X86::VMOVAPDmr, TB_FOLDED_STORE \| TB_ALIGN_16 },
	{ X86::VMOVAPSrr, X86::VMOVAPSmr, TB_FOLDED_STORE \| TB_ALIGN_16 },
	{ X86::VMOVDQArr, X86::VMOVDQAmr, TB_FOLDED_STORE \| TB_ALIGN_16 },
	{ X86::VMOVDQUrr, X86::VMOVDQUmr, TB_FOLDED_STORE },
	{ X86::VMOVPDI2DIrr,X86::VMOVPDI2DImr, TB_FOLDED_STORE },
	{ X86::VMOVPQIto64rr, X86::VMOVPQI2QImr,TB_FOLDED_STORE },
	{ X86::VMOVSDto64rr,X86::VMOVSDto64mr, TB_FOLDED_STORE },
	{ X86::VMOVSS2DIrr, X86::VMOVSS2DImr, TB_FOLDED_STORE },
	{ X86::VMOVUPDrr, X86::VMOVUPDmr, TB_FOLDED_STORE },
	{ X86::VMOVUPSrr, X86::VMOVUPSmr, TB_FOLDED_STORE },
	{ X86::VPEXTRDrr, X86::VPEXTRDmr, TB_FOLDED_STORE },
	{ X86::VPEXTRQrr, X86::VPEXTRQmr, TB_FOLDED_STORE },

	// AVX 256-bit foldable instructions
	{ X86::VEXTRACTI128rr, X86::VEXTRACTI128mr, TB_FOLDED_STORE \| TB_ALIGN_16 },
	{ X86::VMOVAPDYrr, X86::VMOVAPDYmr, TB_FOLDED_STORE \| TB_ALIGN_32 },
	{ X86::VMOVAPSYrr, X86::VMOVAPSYmr, TB_FOLDED_STORE \| TB_ALIGN_32 },
	{ X86::VMOVDQAYrr, X86::VMOVDQAYmr, TB_FOLDED_STORE \| TB_ALIGN_32 },
	{ X86::VMOVDQUYrr, X86::VMOVDQUYmr, TB_FOLDED_STORE },
	{ X86::VMOVUPDYrr, X86::VMOVUPDYmr, TB_FOLDED_STORE },
	{ X86::VMOVUPSYrr, X86::VMOVUPSYmr, TB_FOLDED_STORE },

	// AVX-512 foldable instructions
	{ X86::VEXTRACTF32x4Zrr,X86::VEXTRACTF32x4Zmr, TB_FOLDED_STORE },
	{ X86::VEXTRACTF32x8Zrr,X86::VEXTRACTF32x8Zmr, TB_FOLDED_STORE },
	{ X86::VEXTRACTF64x2Zrr,X86::VEXTRACTF64x2Zmr, TB_FOLDED_STORE },
	{ X86::VEXTRACTF64x4Zrr,X86::VEXTRACTF64x4Zmr, TB_FOLDED_STORE },
	{ X86::VEXTRACTI32x4Zrr,X86::VEXTRACTI32x4Zmr, TB_FOLDED_STORE },
	{ X86::VEXTRACTI32x8Zrr,X86::VEXTRACTI32x8Zmr, TB_FOLDED_STORE },
	{ X86::VEXTRACTI64x2Zrr,X86::VEXTRACTI64x2Zmr, TB_FOLDED_STORE },
	{ X86::VEXTRACTI64x4Zrr,X86::VEXTRACTI64x4Zmr, TB_FOLDED_STORE },
	{ X86::VEXTRACTPSZrr, X86::VEXTRACTPSZmr, TB_FOLDED_STORE },
	{ X86::VMOVPDI2DIZrr, X86::VMOVPDI2DIZmr, TB_FOLDED_STORE },
	{ X86::VMOVAPDZrr, X86::VMOVAPDZmr, TB_FOLDED_STORE \| TB_ALIGN_64 },
	{ X86::VMOVAPSZrr, X86::VMOVAPSZmr, TB_FOLDED_STORE \| TB_ALIGN_64 },
	{ X86::VMOVDQA32Zrr, X86::VMOVDQA32Zmr, TB_FOLDED_STORE \| TB_ALIGN_64 },
	{ X86::VMOVDQA64Zrr, X86::VMOVDQA64Zmr, TB_FOLDED_STORE \| TB_ALIGN_64 },
	{ X86::VMOVUPDZrr, X86::VMOVUPDZmr, TB_FOLDED_STORE },
	{ X86::VMOVUPSZrr, X86::VMOVUPSZmr, TB_FOLDED_STORE },
	{ X86::VMOVDQU8Zrr, X86::VMOVDQU8Zmr, TB_FOLDED_STORE },
	{ X86::VMOVDQU16Zrr, X86::VMOVDQU16Zmr, TB_FOLDED_STORE },
	{ X86::VMOVDQU32Zrr, X86::VMOVDQU32Zmr, TB_FOLDED_STORE },
	{ X86::VMOVDQU64Zrr, X86::VMOVDQU64Zmr, TB_FOLDED_STORE },
	{ X86::VPMOVDBZrr, X86::VPMOVDBZmr, TB_FOLDED_STORE },
	{ X86::VPMOVDWZrr, X86::VPMOVDWZmr, TB_FOLDED_STORE },
	{ X86::VPMOVQDZrr, X86::VPMOVQDZmr, TB_FOLDED_STORE },
	{ X86::VPMOVQWZrr, X86::VPMOVQWZmr, TB_FOLDED_STORE },
	{ X86::VPMOVWBZrr, X86::VPMOVWBZmr, TB_FOLDED_STORE },
	{ X86::VPMOVSDBZrr, X86::VPMOVSDBZmr, TB_FOLDED_STORE },
	{ X86::VPMOVSDWZrr, X86::VPMOVSDWZmr, TB_FOLDED_STORE },
	{ X86::VPMOVSQDZrr, X86::VPMOVSQDZmr, TB_FOLDED_STORE },
	{ X86::VPMOVSQWZrr, X86::VPMOVSQWZmr, TB_FOLDED_STORE },
	{ X86::VPMOVSWBZrr, X86::VPMOVSWBZmr, TB_FOLDED_STORE },
	{ X86::VPMOVUSDBZrr, X86::VPMOVUSDBZmr, TB_FOLDED_STORE },
	{ X86::VPMOVUSDWZrr, X86::VPMOVUSDWZmr, TB_FOLDED_STORE },
	{ X86::VPMOVUSQDZrr, X86::VPMOVUSQDZmr, TB_FOLDED_STORE },
	{ X86::VPMOVUSQWZrr, X86::VPMOVUSQWZmr, TB_FOLDED_STORE },
	{ X86::VPMOVUSWBZrr, X86::VPMOVUSWBZmr, TB_FOLDED_STORE },

	// AVX-512 foldable instructions (256-bit versions)
	{ X86::VEXTRACTF32x4Z256rr,X86::VEXTRACTF32x4Z256mr, TB_FOLDED_STORE },
	{ X86::VEXTRACTF64x2Z256rr,X86::VEXTRACTF64x2Z256mr, TB_FOLDED_STORE },
	{ X86::VEXTRACTI32x4Z256rr,X86::VEXTRACTI32x4Z256mr, TB_FOLDED_STORE },
	{ X86::VEXTRACTI64x2Z256rr,X86::VEXTRACTI64x2Z256mr, TB_FOLDED_STORE },
	{ X86::VMOVAPDZ256rr, X86::VMOVAPDZ256mr, TB_FOLDED_STORE \| TB_ALIGN_32 },
	{ X86::VMOVAPSZ256rr, X86::VMOVAPSZ256mr, TB_FOLDED_STORE \| TB_ALIGN_32 },
	{ X86::VMOVDQA32Z256rr, X86::VMOVDQA32Z256mr, TB_FOLDED_STORE \| TB_ALIGN_32 },
	{ X86::VMOVDQA64Z256rr, X86::VMOVDQA64Z256mr, TB_FOLDED_STORE \| TB_ALIGN_32 },
	{ X86::VMOVUPDZ256rr, X86::VMOVUPDZ256mr, TB_FOLDED_STORE },
	{ X86::VMOVUPSZ256rr, X86::VMOVUPSZ256mr, TB_FOLDED_STORE },
	{ X86::VMOVDQU8Z256rr, X86::VMOVDQU8Z256mr, TB_FOLDED_STORE },
	{ X86::VMOVDQU16Z256rr, X86::VMOVDQU16Z256mr, TB_FOLDED_STORE },
	{ X86::VMOVDQU32Z256rr, X86::VMOVDQU32Z256mr, TB_FOLDED_STORE },
	{ X86::VMOVDQU64Z256rr, X86::VMOVDQU64Z256mr, TB_FOLDED_STORE },
	{ X86::VPMOVDWZ256rr, X86::VPMOVDWZ256mr, TB_FOLDED_STORE },
	{ X86::VPMOVQDZ256rr, X86::VPMOVQDZ256mr, TB_FOLDED_STORE },
	{ X86::VPMOVWBZ256rr, X86::VPMOVWBZ256mr, TB_FOLDED_STORE },
	{ X86::VPMOVSDWZ256rr, X86::VPMOVSDWZ256mr, TB_FOLDED_STORE },
	{ X86::VPMOVSQDZ256rr, X86::VPMOVSQDZ256mr, TB_FOLDED_STORE },
	{ X86::VPMOVSWBZ256rr, X86::VPMOVSWBZ256mr, TB_FOLDED_STORE },
	{ X86::VPMOVUSDWZ256rr, X86::VPMOVUSDWZ256mr, TB_FOLDED_STORE },
	{ X86::VPMOVUSQDZ256rr, X86::VPMOVUSQDZ256mr, TB_FOLDED_STORE },
	{ X86::VPMOVUSWBZ256rr, X86::VPMOVUSWBZ256mr, TB_FOLDED_STORE },

	// AVX-512 foldable instructions (128-bit versions)
	{ X86::VMOVAPDZ128rr, X86::VMOVAPDZ128mr, TB_FOLDED_STORE \| TB_ALIGN_16 },
	{ X86::VMOVAPSZ128rr, X86::VMOVAPSZ128mr, TB_FOLDED_STORE \| TB_ALIGN_16 },
	{ X86::VMOVDQA32Z128rr, X86::VMOVDQA32Z128mr, TB_FOLDED_STORE \| TB_ALIGN_16 },
	{ X86::VMOVDQA64Z128rr, X86::VMOVDQA64Z128mr, TB_FOLDED_STORE \| TB_ALIGN_16 },
	{ X86::VMOVUPDZ128rr, X86::VMOVUPDZ128mr, TB_FOLDED_STORE },
	{ X86::VMOVUPSZ128rr, X86::VMOVUPSZ128mr, TB_FOLDED_STORE },
	{ X86::VMOVDQU8Z128rr, X86::VMOVDQU8Z128mr, TB_FOLDED_STORE },
	{ X86::VMOVDQU16Z128rr, X86::VMOVDQU16Z128mr, TB_FOLDED_STORE },
	{ X86::VMOVDQU32Z128rr, X86::VMOVDQU32Z128mr, TB_FOLDED_STORE },
	{ X86::VMOVDQU64Z128rr, X86::VMOVDQU64Z128mr, TB_FOLDED_STORE },

	// F16C foldable instructions
	{ X86::VCVTPS2PHrr, X86::VCVTPS2PHmr, TB_FOLDED_STORE },
	{ X86::VCVTPS2PHYrr, X86::VCVTPS2PHYmr, TB_FOLDED_STORE }
	};

	for (X86MemoryFoldTableEntry Entry : MemoryFoldTable0) {
	AddTableEntry(RegOp2MemOpTable0, MemOp2RegOpTable,
	Entry.RegOp, Entry.MemOp, TB_INDEX_0 \| Entry.Flags);
	}

	static const X86MemoryFoldTableEntry MemoryFoldTable1[] = {
	{ X86::BSF16rr, X86::BSF16rm, 0 },
	{ X86::BSF32rr, X86::BSF32rm, 0 },
	{ X86::BSF64rr, X86::BSF64rm, 0 },
	{ X86::BSR16rr, X86::BSR16rm, 0 },
	{ X86::BSR32rr, X86::BSR32rm, 0 },
	{ X86::BSR64rr, X86::BSR64rm, 0 },
	{ X86::CMP16rr, X86::CMP16rm, 0 },
	{ X86::CMP32rr, X86::CMP32rm, 0 },
	{ X86::CMP64rr, X86::CMP64rm, 0 },
	{ X86::CMP8rr, X86::CMP8rm, 0 },
	{ X86::CVTSD2SSrr, X86::CVTSD2SSrm, 0 },
	{ X86::CVTSI2SD64rr, X86::CVTSI2SD64rm, 0 },
	{ X86::CVTSI2SDrr, X86::CVTSI2SDrm, 0 },
	{ X86::CVTSI2SS64rr, X86::CVTSI2SS64rm, 0 },
	{ X86::CVTSI2SSrr, X86::CVTSI2SSrm, 0 },
	{ X86::CVTSS2SDrr, X86::CVTSS2SDrm, 0 },
	{ X86::CVTTSD2SI64rr, X86::CVTTSD2SI64rm, 0 },
	{ X86::CVTTSD2SIrr, X86::CVTTSD2SIrm, 0 },
	{ X86::CVTTSS2SI64rr, X86::CVTTSS2SI64rm, 0 },
	{ X86::CVTTSS2SIrr, X86::CVTTSS2SIrm, 0 },
	{ X86::IMUL16rri, X86::IMUL16rmi, 0 },
	{ X86::IMUL16rri8, X86::IMUL16rmi8, 0 },
	{ X86::IMUL32rri, X86::IMUL32rmi, 0 },
	{ X86::IMUL32rri8, X86::IMUL32rmi8, 0 },
	{ X86::IMUL64rri32, X86::IMUL64rmi32, 0 },
	{ X86::IMUL64rri8, X86::IMUL64rmi8, 0 },
	{ X86::Int_COMISDrr, X86::Int_COMISDrm, TB_NO_REVERSE },
	{ X86::Int_COMISSrr, X86::Int_COMISSrm, TB_NO_REVERSE },
	{ X86::CVTSD2SI64rr, X86::CVTSD2SI64rm, TB_NO_REVERSE },
	{ X86::CVTSD2SIrr, X86::CVTSD2SIrm, TB_NO_REVERSE },
	{ X86::CVTSS2SI64rr, X86::CVTSS2SI64rm, TB_NO_REVERSE },
	{ X86::CVTSS2SIrr, X86::CVTSS2SIrm, TB_NO_REVERSE },
	{ X86::CVTDQ2PDrr, X86::CVTDQ2PDrm, TB_NO_REVERSE },
	{ X86::CVTDQ2PSrr, X86::CVTDQ2PSrm, TB_ALIGN_16 },
	{ X86::CVTPD2DQrr, X86::CVTPD2DQrm, TB_ALIGN_16 },
	{ X86::CVTPD2PSrr, X86::CVTPD2PSrm, TB_ALIGN_16 },
	{ X86::CVTPS2DQrr, X86::CVTPS2DQrm, TB_ALIGN_16 },
	{ X86::CVTPS2PDrr, X86::CVTPS2PDrm, TB_NO_REVERSE },
	{ X86::CVTTPD2DQrr, X86::CVTTPD2DQrm, TB_ALIGN_16 },
	{ X86::CVTTPS2DQrr, X86::CVTTPS2DQrm, TB_ALIGN_16 },
	{ X86::Int_CVTTSD2SI64rr,X86::Int_CVTTSD2SI64rm, TB_NO_REVERSE },
	{ X86::Int_CVTTSD2SIrr, X86::Int_CVTTSD2SIrm, TB_NO_REVERSE },
	{ X86::Int_CVTTSS2SI64rr,X86::Int_CVTTSS2SI64rm, TB_NO_REVERSE },
	{ X86::Int_CVTTSS2SIrr, X86::Int_CVTTSS2SIrm, TB_NO_REVERSE },
	{ X86::Int_UCOMISDrr, X86::Int_UCOMISDrm, TB_NO_REVERSE },
	{ X86::Int_UCOMISSrr, X86::Int_UCOMISSrm, TB_NO_REVERSE },
	{ X86::MOV16rr, X86::MOV16rm, 0 },
	{ X86::MOV32rr, X86::MOV32rm, 0 },
	{ X86::MOV64rr, X86::MOV64rm, 0 },
	{ X86::MOV64toPQIrr, X86::MOVQI2PQIrm, 0 },
	{ X86::MOV64toSDrr, X86::MOV64toSDrm, 0 },
	{ X86::MOV8rr, X86::MOV8rm, 0 },
	{ X86::MOVAPDrr, X86::MOVAPDrm, TB_ALIGN_16 },
	{ X86::MOVAPSrr, X86::MOVAPSrm, TB_ALIGN_16 },
	{ X86::MOVDDUPrr, X86::MOVDDUPrm, TB_NO_REVERSE },
	{ X86::MOVDI2PDIrr, X86::MOVDI2PDIrm, 0 },
	{ X86::MOVDI2SSrr, X86::MOVDI2SSrm, 0 },
	{ X86::MOVDQArr, X86::MOVDQArm, TB_ALIGN_16 },
	{ X86::MOVDQUrr, X86::MOVDQUrm, 0 },
	{ X86::MOVSHDUPrr, X86::MOVSHDUPrm, TB_ALIGN_16 },
	{ X86::MOVSLDUPrr, X86::MOVSLDUPrm, TB_ALIGN_16 },
	{ X86::MOVSX16rr8, X86::MOVSX16rm8, 0 },
	{ X86::MOVSX32rr16, X86::MOVSX32rm16, 0 },
	{ X86::MOVSX32rr8, X86::MOVSX32rm8, 0 },
	{ X86::MOVSX64rr16, X86::MOVSX64rm16, 0 },
	{ X86::MOVSX64rr32, X86::MOVSX64rm32, 0 },
	{ X86::MOVSX64rr8, X86::MOVSX64rm8, 0 },
	{ X86::MOVUPDrr, X86::MOVUPDrm, 0 },
	{ X86::MOVUPSrr, X86::MOVUPSrm, 0 },
	{ X86::MOVZPQILo2PQIrr, X86::MOVQI2PQIrm, TB_NO_REVERSE },
	{ X86::MOVZX16rr8, X86::MOVZX16rm8, 0 },
	{ X86::MOVZX32rr16, X86::MOVZX32rm16, 0 },
	{ X86::MOVZX32_NOREXrr8, X86::MOVZX32_NOREXrm8, 0 },
	{ X86::MOVZX32rr8, X86::MOVZX32rm8, 0 },
	{ X86::PABSBrr, X86::PABSBrm, TB_ALIGN_16 },
	{ X86::PABSDrr, X86::PABSDrm, TB_ALIGN_16 },
	{ X86::PABSWrr, X86::PABSWrm, TB_ALIGN_16 },
	{ X86::PCMPESTRIrr, X86::PCMPESTRIrm, TB_ALIGN_16 },
	{ X86::PCMPESTRM128rr, X86::PCMPESTRM128rm, TB_ALIGN_16 },
	{ X86::PCMPISTRIrr, X86::PCMPISTRIrm, TB_ALIGN_16 },
	{ X86::PCMPISTRM128rr, X86::PCMPISTRM128rm, TB_ALIGN_16 },
	{ X86::PHMINPOSUWrr128, X86::PHMINPOSUWrm128, TB_ALIGN_16 },
	{ X86::PMOVSXBDrr, X86::PMOVSXBDrm, TB_NO_REVERSE },
	{ X86::PMOVSXBQrr, X86::PMOVSXBQrm, TB_NO_REVERSE },
	{ X86::PMOVSXBWrr, X86::PMOVSXBWrm, TB_NO_REVERSE },
	{ X86::PMOVSXDQrr, X86::PMOVSXDQrm, TB_NO_REVERSE },
	{ X86::PMOVSXWDrr, X86::PMOVSXWDrm, TB_NO_REVERSE },
	{ X86::PMOVSXWQrr, X86::PMOVSXWQrm, TB_NO_REVERSE },
	{ X86::PMOVZXBDrr, X86::PMOVZXBDrm, TB_NO_REVERSE },
	{ X86::PMOVZXBQrr, X86::PMOVZXBQrm, TB_NO_REVERSE },
	{ X86::PMOVZXBWrr, X86::PMOVZXBWrm, TB_NO_REVERSE },
	{ X86::PMOVZXDQrr, X86::PMOVZXDQrm, TB_NO_REVERSE },
	{ X86::PMOVZXWDrr, X86::PMOVZXWDrm, TB_NO_REVERSE },
	{ X86::PMOVZXWQrr, X86::PMOVZXWQrm, TB_NO_REVERSE },
	{ X86::PSHUFDri, X86::PSHUFDmi, TB_ALIGN_16 },
	{ X86::PSHUFHWri, X86::PSHUFHWmi, TB_ALIGN_16 },
	{ X86::PSHUFLWri, X86::PSHUFLWmi, TB_ALIGN_16 },
	{ X86::PTESTrr, X86::PTESTrm, TB_ALIGN_16 },
	{ X86::RCPPSr, X86::RCPPSm, TB_ALIGN_16 },
	{ X86::RCPSSr, X86::RCPSSm, 0 },
	{ X86::RCPSSr_Int, X86::RCPSSm_Int, TB_NO_REVERSE },
	{ X86::ROUNDPDr, X86::ROUNDPDm, TB_ALIGN_16 },
	{ X86::ROUNDPSr, X86::ROUNDPSm, TB_ALIGN_16 },
	{ X86::ROUNDSDr, X86::ROUNDSDm, 0 },
	{ X86::ROUNDSSr, X86::ROUNDSSm, 0 },
	{ X86::RSQRTPSr, X86::RSQRTPSm, TB_ALIGN_16 },
	{ X86::RSQRTSSr, X86::RSQRTSSm, 0 },
	{ X86::RSQRTSSr_Int, X86::RSQRTSSm_Int, TB_NO_REVERSE },
	{ X86::SQRTPDr, X86::SQRTPDm, TB_ALIGN_16 },
	{ X86::SQRTPSr, X86::SQRTPSm, TB_ALIGN_16 },
	{ X86::SQRTSDr, X86::SQRTSDm, 0 },
	{ X86::SQRTSDr_Int, X86::SQRTSDm_Int, TB_NO_REVERSE },
	{ X86::SQRTSSr, X86::SQRTSSm, 0 },
	{ X86::SQRTSSr_Int, X86::SQRTSSm_Int, TB_NO_REVERSE },
	{ X86::TEST16rr, X86::TEST16rm, 0 },
	{ X86::TEST32rr, X86::TEST32rm, 0 },
	{ X86::TEST64rr, X86::TEST64rm, 0 },
	{ X86::TEST8rr, X86::TEST8rm, 0 },
	// FIXME: TEST*rr EAX,EAX ---> CMP [mem], 0
	{ X86::UCOMISDrr, X86::UCOMISDrm, 0 },
	{ X86::UCOMISSrr, X86::UCOMISSrm, 0 },

	// MMX version of foldable instructions
	{ X86::MMX_CVTPD2PIirr, X86::MMX_CVTPD2PIirm, 0 },
	{ X86::MMX_CVTPI2PDirr, X86::MMX_CVTPI2PDirm, 0 },
	{ X86::MMX_CVTPS2PIirr, X86::MMX_CVTPS2PIirm, 0 },
	{ X86::MMX_CVTTPD2PIirr, X86::MMX_CVTTPD2PIirm, 0 },
	{ X86::MMX_CVTTPS2PIirr, X86::MMX_CVTTPS2PIirm, 0 },
	{ X86::MMX_MOVD64to64rr, X86::MMX_MOVQ64rm, 0 },
	{ X86::MMX_PABSBrr64, X86::MMX_PABSBrm64, 0 },
	{ X86::MMX_PABSDrr64, X86::MMX_PABSDrm64, 0 },
	{ X86::MMX_PABSWrr64, X86::MMX_PABSWrm64, 0 },
	{ X86::MMX_PSHUFWri, X86::MMX_PSHUFWmi, 0 },

	// 3DNow! version of foldable instructions
	{ X86::PF2IDrr, X86::PF2IDrm, 0 },
	{ X86::PF2IWrr, X86::PF2IWrm, 0 },
	{ X86::PFRCPrr, X86::PFRCPrm, 0 },
	{ X86::PFRSQRTrr, X86::PFRSQRTrm, 0 },
	{ X86::PI2FDrr, X86::PI2FDrm, 0 },
	{ X86::PI2FWrr, X86::PI2FWrm, 0 },
	{ X86::PSWAPDrr, X86::PSWAPDrm, 0 },

	// AVX 128-bit versions of foldable instructions
	{ X86::Int_VCOMISDrr, X86::Int_VCOMISDrm, TB_NO_REVERSE },
	{ X86::Int_VCOMISSrr, X86::Int_VCOMISSrm, TB_NO_REVERSE },
	{ X86::Int_VUCOMISDrr, X86::Int_VUCOMISDrm, TB_NO_REVERSE },
	{ X86::Int_VUCOMISSrr, X86::Int_VUCOMISSrm, TB_NO_REVERSE },
	{ X86::VCVTTSD2SI64rr, X86::VCVTTSD2SI64rm, 0 },
	{ X86::Int_VCVTTSD2SI64rr,X86::Int_VCVTTSD2SI64rm,TB_NO_REVERSE },
	{ X86::VCVTTSD2SIrr, X86::VCVTTSD2SIrm, 0 },
	{ X86::Int_VCVTTSD2SIrr,X86::Int_VCVTTSD2SIrm, TB_NO_REVERSE },
	{ X86::VCVTTSS2SI64rr, X86::VCVTTSS2SI64rm, 0 },
	{ X86::Int_VCVTTSS2SI64rr,X86::Int_VCVTTSS2SI64rm,TB_NO_REVERSE },
	{ X86::VCVTTSS2SIrr, X86::VCVTTSS2SIrm, 0 },
	{ X86::Int_VCVTTSS2SIrr,X86::Int_VCVTTSS2SIrm, TB_NO_REVERSE },
	{ X86::VCVTSD2SI64rr, X86::VCVTSD2SI64rm, TB_NO_REVERSE },
	{ X86::VCVTSD2SIrr, X86::VCVTSD2SIrm, TB_NO_REVERSE },
	{ X86::VCVTSS2SI64rr, X86::VCVTSS2SI64rm, TB_NO_REVERSE },
	{ X86::VCVTSS2SIrr, X86::VCVTSS2SIrm, TB_NO_REVERSE },
	{ X86::VCVTDQ2PDrr, X86::VCVTDQ2PDrm, TB_NO_REVERSE },
	{ X86::VCVTDQ2PSrr, X86::VCVTDQ2PSrm, 0 },
	{ X86::VCVTPD2DQrr, X86::VCVTPD2DQrm, 0 },
	{ X86::VCVTPD2PSrr, X86::VCVTPD2PSrm, 0 },
	{ X86::VCVTPS2DQrr, X86::VCVTPS2DQrm, 0 },
	{ X86::VCVTPS2PDrr, X86::VCVTPS2PDrm, TB_NO_REVERSE },
	{ X86::VCVTTPD2DQrr, X86::VCVTTPD2DQrm, 0 },
	{ X86::VCVTTPS2DQrr, X86::VCVTTPS2DQrm, 0 },
	{ X86::VMOV64toPQIrr, X86::VMOVQI2PQIrm, 0 },
	{ X86::VMOV64toSDrr, X86::VMOV64toSDrm, 0 },
	{ X86::VMOVAPDrr, X86::VMOVAPDrm, TB_ALIGN_16 },
	{ X86::VMOVAPSrr, X86::VMOVAPSrm, TB_ALIGN_16 },
	{ X86::VMOVDDUPrr, X86::VMOVDDUPrm, TB_NO_REVERSE },
	{ X86::VMOVDI2PDIrr, X86::VMOVDI2PDIrm, 0 },
	{ X86::VMOVDI2SSrr, X86::VMOVDI2SSrm, 0 },
	{ X86::VMOVDQArr, X86::VMOVDQArm, TB_ALIGN_16 },
	{ X86::VMOVDQUrr, X86::VMOVDQUrm, 0 },
	{ X86::VMOVSLDUPrr, X86::VMOVSLDUPrm, 0 },
	{ X86::VMOVSHDUPrr, X86::VMOVSHDUPrm, 0 },
	{ X86::VMOVUPDrr, X86::VMOVUPDrm, 0 },
	{ X86::VMOVUPSrr, X86::VMOVUPSrm, 0 },
	{ X86::VMOVZPQILo2PQIrr,X86::VMOVQI2PQIrm, TB_NO_REVERSE },
	{ X86::VPABSBrr, X86::VPABSBrm, 0 },
	{ X86::VPABSDrr, X86::VPABSDrm, 0 },
	{ X86::VPABSWrr, X86::VPABSWrm, 0 },
	{ X86::VPCMPESTRIrr, X86::VPCMPESTRIrm, 0 },
	{ X86::VPCMPESTRM128rr, X86::VPCMPESTRM128rm, 0 },
	{ X86::VPCMPISTRIrr, X86::VPCMPISTRIrm, 0 },
	{ X86::VPCMPISTRM128rr, X86::VPCMPISTRM128rm, 0 },
	{ X86::VPHMINPOSUWrr128, X86::VPHMINPOSUWrm128, 0 },
	{ X86::VPERMILPDri, X86::VPERMILPDmi, 0 },
	{ X86::VPERMILPSri, X86::VPERMILPSmi, 0 },
	{ X86::VPMOVSXBDrr, X86::VPMOVSXBDrm, TB_NO_REVERSE },
	{ X86::VPMOVSXBQrr, X86::VPMOVSXBQrm, TB_NO_REVERSE },
	{ X86::VPMOVSXBWrr, X86::VPMOVSXBWrm, TB_NO_REVERSE },
	{ X86::VPMOVSXDQrr, X86::VPMOVSXDQrm, TB_NO_REVERSE },
	{ X86::VPMOVSXWDrr, X86::VPMOVSXWDrm, TB_NO_REVERSE },
	{ X86::VPMOVSXWQrr, X86::VPMOVSXWQrm, TB_NO_REVERSE },
	{ X86::VPMOVZXBDrr, X86::VPMOVZXBDrm, TB_NO_REVERSE },
	{ X86::VPMOVZXBQrr, X86::VPMOVZXBQrm, TB_NO_REVERSE },
	{ X86::VPMOVZXBWrr, X86::VPMOVZXBWrm, TB_NO_REVERSE },
	{ X86::VPMOVZXDQrr, X86::VPMOVZXDQrm, TB_NO_REVERSE },
	{ X86::VPMOVZXWDrr, X86::VPMOVZXWDrm, TB_NO_REVERSE },
	{ X86::VPMOVZXWQrr, X86::VPMOVZXWQrm, TB_NO_REVERSE },
	{ X86::VPSHUFDri, X86::VPSHUFDmi, 0 },
	{ X86::VPSHUFHWri, X86::VPSHUFHWmi, 0 },
	{ X86::VPSHUFLWri, X86::VPSHUFLWmi, 0 },
	{ X86::VPTESTrr, X86::VPTESTrm, 0 },
	{ X86::VRCPPSr, X86::VRCPPSm, 0 },
	{ X86::VROUNDPDr, X86::VROUNDPDm, 0 },
	{ X86::VROUNDPSr, X86::VROUNDPSm, 0 },
	{ X86::VRSQRTPSr, X86::VRSQRTPSm, 0 },
	{ X86::VSQRTPDr, X86::VSQRTPDm, 0 },
	{ X86::VSQRTPSr, X86::VSQRTPSm, 0 },
	{ X86::VTESTPDrr, X86::VTESTPDrm, 0 },
	{ X86::VTESTPSrr, X86::VTESTPSrm, 0 },
	{ X86::VUCOMISDrr, X86::VUCOMISDrm, 0 },
	{ X86::VUCOMISSrr, X86::VUCOMISSrm, 0 },

	// AVX 256-bit foldable instructions
	{ X86::VCVTDQ2PDYrr, X86::VCVTDQ2PDYrm, TB_NO_REVERSE },
	{ X86::VCVTDQ2PSYrr, X86::VCVTDQ2PSYrm, 0 },
	{ X86::VCVTPD2DQYrr, X86::VCVTPD2DQYrm, 0 },
	{ X86::VCVTPD2PSYrr, X86::VCVTPD2PSYrm, 0 },
	{ X86::VCVTPS2DQYrr, X86::VCVTPS2DQYrm, 0 },
	{ X86::VCVTPS2PDYrr, X86::VCVTPS2PDYrm, TB_NO_REVERSE },
	{ X86::VCVTTPD2DQYrr, X86::VCVTTPD2DQYrm, 0 },
	{ X86::VCVTTPS2DQYrr, X86::VCVTTPS2DQYrm, 0 },
	{ X86::VMOVAPDYrr, X86::VMOVAPDYrm, TB_ALIGN_32 },
	{ X86::VMOVAPSYrr, X86::VMOVAPSYrm, TB_ALIGN_32 },
	{ X86::VMOVDDUPYrr, X86::VMOVDDUPYrm, 0 },
	{ X86::VMOVDQAYrr, X86::VMOVDQAYrm, TB_ALIGN_32 },
	{ X86::VMOVDQUYrr, X86::VMOVDQUYrm, 0 },
	{ X86::VMOVSLDUPYrr, X86::VMOVSLDUPYrm, 0 },
	{ X86::VMOVSHDUPYrr, X86::VMOVSHDUPYrm, 0 },
	{ X86::VMOVUPDYrr, X86::VMOVUPDYrm, 0 },
	{ X86::VMOVUPSYrr, X86::VMOVUPSYrm, 0 },
	{ X86::VPERMILPDYri, X86::VPERMILPDYmi, 0 },
	{ X86::VPERMILPSYri, X86::VPERMILPSYmi, 0 },
	{ X86::VPTESTYrr, X86::VPTESTYrm, 0 },
	{ X86::VRCPPSYr, X86::VRCPPSYm, 0 },
	{ X86::VROUNDYPDr, X86::VROUNDYPDm, 0 },
	{ X86::VROUNDYPSr, X86::VROUNDYPSm, 0 },
	{ X86::VRSQRTPSYr, X86::VRSQRTPSYm, 0 },
	{ X86::VSQRTPDYr, X86::VSQRTPDYm, 0 },
	{ X86::VSQRTPSYr, X86::VSQRTPSYm, 0 },
	{ X86::VTESTPDYrr, X86::VTESTPDYrm, 0 },
	{ X86::VTESTPSYrr, X86::VTESTPSYrm, 0 },

	// AVX2 foldable instructions

	// VBROADCASTS{SD}rr register instructions were an AVX2 addition while the
	// VBROADCASTS{SD}rm memory instructions were available from AVX1.
	// TB_NO_REVERSE prevents unfolding from introducing an illegal instruction
	// on AVX1 targets. The VPBROADCAST instructions are all AVX2 instructions
	// so they don't need an equivalent limitation.
	{ X86::VBROADCASTSSrr, X86::VBROADCASTSSrm, TB_NO_REVERSE },
	{ X86::VBROADCASTSSYrr, X86::VBROADCASTSSYrm, TB_NO_REVERSE },
	{ X86::VBROADCASTSDYrr, X86::VBROADCASTSDYrm, TB_NO_REVERSE },
	{ X86::VPABSBYrr, X86::VPABSBYrm, 0 },
	{ X86::VPABSDYrr, X86::VPABSDYrm, 0 },
	{ X86::VPABSWYrr, X86::VPABSWYrm, 0 },
	{ X86::VPBROADCASTBrr, X86::VPBROADCASTBrm, TB_NO_REVERSE },
	{ X86::VPBROADCASTBYrr, X86::VPBROADCASTBYrm, TB_NO_REVERSE },
	{ X86::VPBROADCASTDrr, X86::VPBROADCASTDrm, TB_NO_REVERSE },
	{ X86::VPBROADCASTDYrr, X86::VPBROADCASTDYrm, TB_NO_REVERSE },
	{ X86::VPBROADCASTQrr, X86::VPBROADCASTQrm, TB_NO_REVERSE },
	{ X86::VPBROADCASTQYrr, X86::VPBROADCASTQYrm, TB_NO_REVERSE },
	{ X86::VPBROADCASTWrr, X86::VPBROADCASTWrm, TB_NO_REVERSE },
	{ X86::VPBROADCASTWYrr, X86::VPBROADCASTWYrm, TB_NO_REVERSE },
	{ X86::VPERMPDYri, X86::VPERMPDYmi, 0 },
	{ X86::VPERMQYri, X86::VPERMQYmi, 0 },
	{ X86::VPMOVSXBDYrr, X86::VPMOVSXBDYrm, TB_NO_REVERSE },
	{ X86::VPMOVSXBQYrr, X86::VPMOVSXBQYrm, TB_NO_REVERSE },
	{ X86::VPMOVSXBWYrr, X86::VPMOVSXBWYrm, 0 },
	{ X86::VPMOVSXDQYrr, X86::VPMOVSXDQYrm, 0 },
	{ X86::VPMOVSXWDYrr, X86::VPMOVSXWDYrm, 0 },
	{ X86::VPMOVSXWQYrr, X86::VPMOVSXWQYrm, TB_NO_REVERSE },
	{ X86::VPMOVZXBDYrr, X86::VPMOVZXBDYrm, TB_NO_REVERSE },
	{ X86::VPMOVZXBQYrr, X86::VPMOVZXBQYrm, TB_NO_REVERSE },
	{ X86::VPMOVZXBWYrr, X86::VPMOVZXBWYrm, 0 },
	{ X86::VPMOVZXDQYrr, X86::VPMOVZXDQYrm, 0 },
	{ X86::VPMOVZXWDYrr, X86::VPMOVZXWDYrm, 0 },
	{ X86::VPMOVZXWQYrr, X86::VPMOVZXWQYrm, TB_NO_REVERSE },
	{ X86::VPSHUFDYri, X86::VPSHUFDYmi, 0 },
	{ X86::VPSHUFHWYri, X86::VPSHUFHWYmi, 0 },
	{ X86::VPSHUFLWYri, X86::VPSHUFLWYmi, 0 },

	// XOP foldable instructions
	{ X86::VFRCZPDrr, X86::VFRCZPDrm, 0 },
	{ X86::VFRCZPDrrY, X86::VFRCZPDrmY, 0 },
	{ X86::VFRCZPSrr, X86::VFRCZPSrm, 0 },
	{ X86::VFRCZPSrrY, X86::VFRCZPSrmY, 0 },
	{ X86::VFRCZSDrr, X86::VFRCZSDrm, 0 },
	{ X86::VFRCZSSrr, X86::VFRCZSSrm, 0 },
	{ X86::VPHADDBDrr, X86::VPHADDBDrm, 0 },
	{ X86::VPHADDBQrr, X86::VPHADDBQrm, 0 },
	{ X86::VPHADDBWrr, X86::VPHADDBWrm, 0 },
	{ X86::VPHADDDQrr, X86::VPHADDDQrm, 0 },
	{ X86::VPHADDWDrr, X86::VPHADDWDrm, 0 },
	{ X86::VPHADDWQrr, X86::VPHADDWQrm, 0 },
	{ X86::VPHADDUBDrr, X86::VPHADDUBDrm, 0 },
	{ X86::VPHADDUBQrr, X86::VPHADDUBQrm, 0 },
	{ X86::VPHADDUBWrr, X86::VPHADDUBWrm, 0 },
	{ X86::VPHADDUDQrr, X86::VPHADDUDQrm, 0 },
	{ X86::VPHADDUWDrr, X86::VPHADDUWDrm, 0 },
	{ X86::VPHADDUWQrr, X86::VPHADDUWQrm, 0 },
	{ X86::VPHSUBBWrr, X86::VPHSUBBWrm, 0 },
	{ X86::VPHSUBDQrr, X86::VPHSUBDQrm, 0 },
	{ X86::VPHSUBWDrr, X86::VPHSUBWDrm, 0 },
	{ X86::VPROTBri, X86::VPROTBmi, 0 },
	{ X86::VPROTBrr, X86::VPROTBmr, 0 },
	{ X86::VPROTDri, X86::VPROTDmi, 0 },
	{ X86::VPROTDrr, X86::VPROTDmr, 0 },
	{ X86::VPROTQri, X86::VPROTQmi, 0 },
	{ X86::VPROTQrr, X86::VPROTQmr, 0 },
	{ X86::VPROTWri, X86::VPROTWmi, 0 },
	{ X86::VPROTWrr, X86::VPROTWmr, 0 },
	{ X86::VPSHABrr, X86::VPSHABmr, 0 },
	{ X86::VPSHADrr, X86::VPSHADmr, 0 },
	{ X86::VPSHAQrr, X86::VPSHAQmr, 0 },
	{ X86::VPSHAWrr, X86::VPSHAWmr, 0 },
	{ X86::VPSHLBrr, X86::VPSHLBmr, 0 },
	{ X86::VPSHLDrr, X86::VPSHLDmr, 0 },
	{ X86::VPSHLQrr, X86::VPSHLQmr, 0 },
	{ X86::VPSHLWrr, X86::VPSHLWmr, 0 },

	// BMI/BMI2/LZCNT/POPCNT/TBM foldable instructions
	{ X86::BEXTR32rr, X86::BEXTR32rm, 0 },
	{ X86::BEXTR64rr, X86::BEXTR64rm, 0 },
	{ X86::BEXTRI32ri, X86::BEXTRI32mi, 0 },
	{ X86::BEXTRI64ri, X86::BEXTRI64mi, 0 },
	{ X86::BLCFILL32rr, X86::BLCFILL32rm, 0 },
	{ X86::BLCFILL64rr, X86::BLCFILL64rm, 0 },
	{ X86::BLCI32rr, X86::BLCI32rm, 0 },
	{ X86::BLCI64rr, X86::BLCI64rm, 0 },
	{ X86::BLCIC32rr, X86::BLCIC32rm, 0 },
	{ X86::BLCIC64rr, X86::BLCIC64rm, 0 },
	{ X86::BLCMSK32rr, X86::BLCMSK32rm, 0 },
	{ X86::BLCMSK64rr, X86::BLCMSK64rm, 0 },
	{ X86::BLCS32rr, X86::BLCS32rm, 0 },
	{ X86::BLCS64rr, X86::BLCS64rm, 0 },
	{ X86::BLSFILL32rr, X86::BLSFILL32rm, 0 },
	{ X86::BLSFILL64rr, X86::BLSFILL64rm, 0 },
	{ X86::BLSI32rr, X86::BLSI32rm, 0 },
	{ X86::BLSI64rr, X86::BLSI64rm, 0 },
	{ X86::BLSIC32rr, X86::BLSIC32rm, 0 },
	{ X86::BLSIC64rr, X86::BLSIC64rm, 0 },
	{ X86::BLSMSK32rr, X86::BLSMSK32rm, 0 },
	{ X86::BLSMSK64rr, X86::BLSMSK64rm, 0 },
	{ X86::BLSR32rr, X86::BLSR32rm, 0 },
	{ X86::BLSR64rr, X86::BLSR64rm, 0 },
	{ X86::BZHI32rr, X86::BZHI32rm, 0 },
	{ X86::BZHI64rr, X86::BZHI64rm, 0 },
	{ X86::LZCNT16rr, X86::LZCNT16rm, 0 },
	{ X86::LZCNT32rr, X86::LZCNT32rm, 0 },
	{ X86::LZCNT64rr, X86::LZCNT64rm, 0 },
	{ X86::POPCNT16rr, X86::POPCNT16rm, 0 },
	{ X86::POPCNT32rr, X86::POPCNT32rm, 0 },
	{ X86::POPCNT64rr, X86::POPCNT64rm, 0 },
	{ X86::RORX32ri, X86::RORX32mi, 0 },
	{ X86::RORX64ri, X86::RORX64mi, 0 },
	{ X86::SARX32rr, X86::SARX32rm, 0 },
	{ X86::SARX64rr, X86::SARX64rm, 0 },
	{ X86::SHRX32rr, X86::SHRX32rm, 0 },
	{ X86::SHRX64rr, X86::SHRX64rm, 0 },
	{ X86::SHLX32rr, X86::SHLX32rm, 0 },
	{ X86::SHLX64rr, X86::SHLX64rm, 0 },
	{ X86::T1MSKC32rr, X86::T1MSKC32rm, 0 },
	{ X86::T1MSKC64rr, X86::T1MSKC64rm, 0 },
	{ X86::TZCNT16rr, X86::TZCNT16rm, 0 },
	{ X86::TZCNT32rr, X86::TZCNT32rm, 0 },
	{ X86::TZCNT64rr, X86::TZCNT64rm, 0 },
	{ X86::TZMSK32rr, X86::TZMSK32rm, 0 },
	{ X86::TZMSK64rr, X86::TZMSK64rm, 0 },

	// AVX-512 foldable instructions
	{ X86::VBROADCASTSSZr, X86::VBROADCASTSSZm, TB_NO_REVERSE },
	{ X86::VBROADCASTSSZr_s, X86::VBROADCASTSSZm, TB_NO_REVERSE },
	{ X86::VBROADCASTSDZr, X86::VBROADCASTSDZm, TB_NO_REVERSE },
	{ X86::VBROADCASTSDZr_s, X86::VBROADCASTSDZm, TB_NO_REVERSE },
	{ X86::VMOV64toPQIZrr, X86::VMOVQI2PQIZrm, 0 },
	{ X86::VMOVZPQILo2PQIZrr,X86::VMOVQI2PQIZrm, TB_NO_REVERSE },
	{ X86::VMOVDI2SSZrr, X86::VMOVDI2SSZrm, 0 },
	{ X86::VMOVAPDZrr, X86::VMOVAPDZrm, TB_ALIGN_64 },
	{ X86::VMOVAPSZrr, X86::VMOVAPSZrm, TB_ALIGN_64 },
	{ X86::VMOVDQA32Zrr, X86::VMOVDQA32Zrm, TB_ALIGN_64 },
	{ X86::VMOVDQA64Zrr, X86::VMOVDQA64Zrm, TB_ALIGN_64 },
	{ X86::VMOVDQU8Zrr, X86::VMOVDQU8Zrm, 0 },
	{ X86::VMOVDQU16Zrr, X86::VMOVDQU16Zrm, 0 },
	{ X86::VMOVDQU32Zrr, X86::VMOVDQU32Zrm, 0 },
	{ X86::VMOVDQU64Zrr, X86::VMOVDQU64Zrm, 0 },
	{ X86::VMOVUPDZrr, X86::VMOVUPDZrm, 0 },
	{ X86::VMOVUPSZrr, X86::VMOVUPSZrm, 0 },
	{ X86::VPABSDZrr, X86::VPABSDZrm, 0 },
	{ X86::VPABSQZrr, X86::VPABSQZrm, 0 },
	{ X86::VPERMILPDZri, X86::VPERMILPDZmi, 0 },
	{ X86::VPERMILPSZri, X86::VPERMILPSZmi, 0 },
	{ X86::VPERMPDZri, X86::VPERMPDZmi, 0 },
	{ X86::VPERMQZri, X86::VPERMQZmi, 0 },
	{ X86::VPMOVSXBDZrr, X86::VPMOVSXBDZrm, 0 },
	{ X86::VPMOVSXBQZrr, X86::VPMOVSXBQZrm, TB_NO_REVERSE },
	{ X86::VPMOVSXBWZrr, X86::VPMOVSXBWZrm, 0 },
	{ X86::VPMOVSXDQZrr, X86::VPMOVSXDQZrm, 0 },
	{ X86::VPMOVSXWDZrr, X86::VPMOVSXWDZrm, 0 },
	{ X86::VPMOVSXWQZrr, X86::VPMOVSXWQZrm, 0 },
	{ X86::VPMOVZXBDZrr, X86::VPMOVZXBDZrm, 0 },
	{ X86::VPMOVZXBQZrr, X86::VPMOVZXBQZrm, TB_NO_REVERSE },
	{ X86::VPMOVZXBWZrr, X86::VPMOVZXBWZrm, 0 },
	{ X86::VPMOVZXDQZrr, X86::VPMOVZXDQZrm, 0 },
	{ X86::VPMOVZXWDZrr, X86::VPMOVZXWDZrm, 0 },
	{ X86::VPMOVZXWQZrr, X86::VPMOVZXWQZrm, 0 },
	{ X86::VPSHUFDZri, X86::VPSHUFDZmi, 0 },
	{ X86::VPSHUFHWZri, X86::VPSHUFHWZmi, 0 },
	{ X86::VPSHUFLWZri, X86::VPSHUFLWZmi, 0 },

	// AVX-512 foldable instructions (256-bit versions)
	{ X86::VBROADCASTSSZ256r, X86::VBROADCASTSSZ256m, TB_NO_REVERSE },
	{ X86::VBROADCASTSSZ256r_s, X86::VBROADCASTSSZ256m, TB_NO_REVERSE },
	{ X86::VBROADCASTSDZ256r, X86::VBROADCASTSDZ256m, TB_NO_REVERSE },
	{ X86::VBROADCASTSDZ256r_s, X86::VBROADCASTSDZ256m, TB_NO_REVERSE },
	{ X86::VMOVAPDZ256rr, X86::VMOVAPDZ256rm, TB_ALIGN_32 },
	{ X86::VMOVAPSZ256rr, X86::VMOVAPSZ256rm, TB_ALIGN_32 },
	{ X86::VMOVDQA32Z256rr, X86::VMOVDQA32Z256rm, TB_ALIGN_32 },
	{ X86::VMOVDQA64Z256rr, X86::VMOVDQA64Z256rm, TB_ALIGN_32 },
	{ X86::VMOVDQU8Z256rr, X86::VMOVDQU8Z256rm, 0 },
	{ X86::VMOVDQU16Z256rr, X86::VMOVDQU16Z256rm, 0 },
	{ X86::VMOVDQU32Z256rr, X86::VMOVDQU32Z256rm, 0 },
	{ X86::VMOVDQU64Z256rr, X86::VMOVDQU64Z256rm, 0 },
	{ X86::VMOVUPDZ256rr, X86::VMOVUPDZ256rm, 0 },
	{ X86::VMOVUPSZ256rr, X86::VMOVUPSZ256rm, 0 },
	{ X86::VPERMILPDZ256ri, X86::VPERMILPDZ256mi, 0 },
	{ X86::VPERMILPSZ256ri, X86::VPERMILPSZ256mi, 0 },
	{ X86::VPERMPDZ256ri, X86::VPERMPDZ256mi, 0 },
	{ X86::VPERMQZ256ri, X86::VPERMQZ256mi, 0 },
	{ X86::VPMOVSXBDZ256rr, X86::VPMOVSXBDZ256rm, TB_NO_REVERSE },
	{ X86::VPMOVSXBQZ256rr, X86::VPMOVSXBQZ256rm, TB_NO_REVERSE },
	{ X86::VPMOVSXBWZ256rr, X86::VPMOVSXBWZ256rm, 0 },
	{ X86::VPMOVSXDQZ256rr, X86::VPMOVSXDQZ256rm, 0 },
	{ X86::VPMOVSXWDZ256rr, X86::VPMOVSXWDZ256rm, 0 },
	{ X86::VPMOVSXWQZ256rr, X86::VPMOVSXWQZ256rm, TB_NO_REVERSE },
	{ X86::VPMOVZXBDZ256rr, X86::VPMOVZXBDZ256rm, TB_NO_REVERSE },
	{ X86::VPMOVZXBQZ256rr, X86::VPMOVZXBQZ256rm, TB_NO_REVERSE },
	{ X86::VPMOVZXBWZ256rr, X86::VPMOVZXBWZ256rm, 0 },
	{ X86::VPMOVZXDQZ256rr, X86::VPMOVZXDQZ256rm, 0 },
	{ X86::VPMOVZXWDZ256rr, X86::VPMOVZXWDZ256rm, 0 },
	{ X86::VPMOVZXWQZ256rr, X86::VPMOVZXWQZ256rm, TB_NO_REVERSE },
	{ X86::VPSHUFDZ256ri, X86::VPSHUFDZ256mi, 0 },
	{ X86::VPSHUFHWZ256ri, X86::VPSHUFHWZ256mi, 0 },
	{ X86::VPSHUFLWZ256ri, X86::VPSHUFLWZ256mi, 0 },

	// AVX-512 foldable instructions (128-bit versions)
	{ X86::VBROADCASTSSZ128r, X86::VBROADCASTSSZ128m, TB_NO_REVERSE },
	{ X86::VBROADCASTSSZ128r_s, X86::VBROADCASTSSZ128m, TB_NO_REVERSE },
	{ X86::VMOVAPDZ128rr, X86::VMOVAPDZ128rm, TB_ALIGN_16 },
	{ X86::VMOVAPSZ128rr, X86::VMOVAPSZ128rm, TB_ALIGN_16 },
	{ X86::VMOVDQA32Z128rr, X86::VMOVDQA32Z128rm, TB_ALIGN_16 },
	{ X86::VMOVDQA64Z128rr, X86::VMOVDQA64Z128rm, TB_ALIGN_16 },
	{ X86::VMOVDQU8Z128rr, X86::VMOVDQU8Z128rm, 0 },
	{ X86::VMOVDQU16Z128rr, X86::VMOVDQU16Z128rm, 0 },
	{ X86::VMOVDQU32Z128rr, X86::VMOVDQU32Z128rm, 0 },
	{ X86::VMOVDQU64Z128rr, X86::VMOVDQU64Z128rm, 0 },
	{ X86::VMOVUPDZ128rr, X86::VMOVUPDZ128rm, 0 },
	{ X86::VMOVUPSZ128rr, X86::VMOVUPSZ128rm, 0 },
	{ X86::VPERMILPDZ128ri, X86::VPERMILPDZ128mi, 0 },
	{ X86::VPERMILPSZ128ri, X86::VPERMILPSZ128mi, 0 },
	{ X86::VPMOVSXBDZ128rr, X86::VPMOVSXBDZ128rm, TB_NO_REVERSE },
	{ X86::VPMOVSXBQZ128rr, X86::VPMOVSXBQZ128rm, TB_NO_REVERSE },
	{ X86::VPMOVSXBWZ128rr, X86::VPMOVSXBWZ128rm, TB_NO_REVERSE },
	{ X86::VPMOVSXDQZ128rr, X86::VPMOVSXDQZ128rm, TB_NO_REVERSE },
	{ X86::VPMOVSXWDZ128rr, X86::VPMOVSXWDZ128rm, TB_NO_REVERSE },
	{ X86::VPMOVSXWQZ128rr, X86::VPMOVSXWQZ128rm, TB_NO_REVERSE },
	{ X86::VPMOVZXBDZ128rr, X86::VPMOVZXBDZ128rm, TB_NO_REVERSE },
	{ X86::VPMOVZXBQZ128rr, X86::VPMOVZXBQZ128rm, TB_NO_REVERSE },
	{ X86::VPMOVZXBWZ128rr, X86::VPMOVZXBWZ128rm, TB_NO_REVERSE },
	{ X86::VPMOVZXDQZ128rr, X86::VPMOVZXDQZ128rm, TB_NO_REVERSE },
	{ X86::VPMOVZXWDZ128rr, X86::VPMOVZXWDZ128rm, TB_NO_REVERSE },
	{ X86::VPMOVZXWQZ128rr, X86::VPMOVZXWQZ128rm, TB_NO_REVERSE },
	{ X86::VPSHUFDZ128ri, X86::VPSHUFDZ128mi, 0 },
	{ X86::VPSHUFHWZ128ri, X86::VPSHUFHWZ128mi, 0 },
	{ X86::VPSHUFLWZ128ri, X86::VPSHUFLWZ128mi, 0 },

	// F16C foldable instructions
	{ X86::VCVTPH2PSrr, X86::VCVTPH2PSrm, 0 },
	{ X86::VCVTPH2PSYrr, X86::VCVTPH2PSYrm, 0 },

	// AES foldable instructions
	{ X86::AESIMCrr, X86::AESIMCrm, TB_ALIGN_16 },
	{ X86::AESKEYGENASSIST128rr, X86::AESKEYGENASSIST128rm, TB_ALIGN_16 },
	{ X86::VAESIMCrr, X86::VAESIMCrm, 0 },
	{ X86::VAESKEYGENASSIST128rr, X86::VAESKEYGENASSIST128rm, 0 }
	};

	// Hand every MemoryFoldTable1 entry to AddTableEntry along with
	// RegOp2MemOpTable1 and MemOp2RegOpTable (presumably the reg->mem and
	// mem->reg lookup maps -- confirm against AddTableEntry). Each entry's own
	// flags are combined with TB_INDEX_1 and TB_FOLDED_LOAD before being passed.
	for (const X86MemoryFoldTableEntry &Entry : MemoryFoldTable1) {
	  AddTableEntry(RegOp2MemOpTable1, MemOp2RegOpTable,
	                Entry.RegOp, Entry.MemOp,
	                // Index 1, folded load
	                Entry.Flags | TB_INDEX_1 | TB_FOLDED_LOAD);
	}

	static const X86MemoryFoldTableEntry MemoryFoldTable2[] = {
	{ X86::ADC32rr, X86::ADC32rm, 0 },
	{ X86::ADC64rr, X86::ADC64rm, 0 },
	{ X86::ADD16rr, X86::ADD16rm, 0 },
	{ X86::ADD16rr_DB, X86::ADD16rm, TB_NO_REVERSE },
	{ X86::ADD32rr, X86::ADD32rm, 0 },
	{ X86::ADD32rr_DB, X86::ADD32rm, TB_NO_REVERSE },
	{ X86::ADD64rr, X86::ADD64rm, 0 },
	{ X86::ADD64rr_DB, X86::ADD64rm, TB_NO_REVERSE },
	{ X86::ADD8rr, X86::ADD8rm, 0 },
	{ X86::ADDPDrr, X86::ADDPDrm, TB_ALIGN_16 },
	{ X86::ADDPSrr, X86::ADDPSrm, TB_ALIGN_16 },
	{ X86::ADDSDrr, X86::ADDSDrm, 0 },
	{ X86::ADDSDrr_Int, X86::ADDSDrm_Int, TB_NO_REVERSE },
	{ X86::ADDSSrr, X86::ADDSSrm, 0 },
	{ X86::ADDSSrr_Int, X86::ADDSSrm_Int, TB_NO_REVERSE },
	{ X86::ADDSUBPDrr, X86::ADDSUBPDrm, TB_ALIGN_16 },
	{ X86::ADDSUBPSrr, X86::ADDSUBPSrm, TB_ALIGN_16 },
	{ X86::AND16rr, X86::AND16rm, 0 },
	{ X86::AND32rr, X86::AND32rm, 0 },
	{ X86::AND64rr, X86::AND64rm, 0 },
	{ X86::AND8rr, X86::AND8rm, 0 },
	{ X86::ANDNPDrr, X86::ANDNPDrm, TB_ALIGN_16 },
	{ X86::ANDNPSrr, X86::ANDNPSrm, TB_ALIGN_16 },
	{ X86::ANDPDrr, X86::ANDPDrm, TB_ALIGN_16 },
	{ X86::ANDPSrr, X86::ANDPSrm, TB_ALIGN_16 },
	{ X86::BLENDPDrri, X86::BLENDPDrmi, TB_ALIGN_16 },
	{ X86::BLENDPSrri, X86::BLENDPSrmi, TB_ALIGN_16 },
	{ X86::BLENDVPDrr0, X86::BLENDVPDrm0, TB_ALIGN_16 },
	{ X86::BLENDVPSrr0, X86::BLENDVPSrm0, TB_ALIGN_16 },
	{ X86::CMOVA16rr, X86::CMOVA16rm, 0 },
	{ X86::CMOVA32rr, X86::CMOVA32rm, 0 },
	{ X86::CMOVA64rr, X86::CMOVA64rm, 0 },
	{ X86::CMOVAE16rr, X86::CMOVAE16rm, 0 },
	{ X86::CMOVAE32rr, X86::CMOVAE32rm, 0 },
	{ X86::CMOVAE64rr, X86::CMOVAE64rm, 0 },
	{ X86::CMOVB16rr, X86::CMOVB16rm, 0 },
	{ X86::CMOVB32rr, X86::CMOVB32rm, 0 },
	{ X86::CMOVB64rr, X86::CMOVB64rm, 0 },
	{ X86::CMOVBE16rr, X86::CMOVBE16rm, 0 },
	{ X86::CMOVBE32rr, X86::CMOVBE32rm, 0 },
	{ X86::CMOVBE64rr, X86::CMOVBE64rm, 0 },
	{ X86::CMOVE16rr, X86::CMOVE16rm, 0 },
	{ X86::CMOVE32rr, X86::CMOVE32rm, 0 },
	{ X86::CMOVE64rr, X86::CMOVE64rm, 0 },
	{ X86::CMOVG16rr, X86::CMOVG16rm, 0 },
	{ X86::CMOVG32rr, X86::CMOVG32rm, 0 },
	{ X86::CMOVG64rr, X86::CMOVG64rm, 0 },
	{ X86::CMOVGE16rr, X86::CMOVGE16rm, 0 },
	{ X86::CMOVGE32rr, X86::CMOVGE32rm, 0 },
	{ X86::CMOVGE64rr, X86::CMOVGE64rm, 0 },
	{ X86::CMOVL16rr, X86::CMOVL16rm, 0 },
	{ X86::CMOVL32rr, X86::CMOVL32rm, 0 },
	{ X86::CMOVL64rr, X86::CMOVL64rm, 0 },
	{ X86::CMOVLE16rr, X86::CMOVLE16rm, 0 },
	{ X86::CMOVLE32rr, X86::CMOVLE32rm, 0 },
	{ X86::CMOVLE64rr, X86::CMOVLE64rm, 0 },
	{ X86::CMOVNE16rr, X86::CMOVNE16rm, 0 },
	{ X86::CMOVNE32rr, X86::CMOVNE32rm, 0 },
	{ X86::CMOVNE64rr, X86::CMOVNE64rm, 0 },
	{ X86::CMOVNO16rr, X86::CMOVNO16rm, 0 },
	{ X86::CMOVNO32rr, X86::CMOVNO32rm, 0 },
	{ X86::CMOVNO64rr, X86::CMOVNO64rm, 0 },
	{ X86::CMOVNP16rr, X86::CMOVNP16rm, 0 },
	{ X86::CMOVNP32rr, X86::CMOVNP32rm, 0 },
	{ X86::CMOVNP64rr, X86::CMOVNP64rm, 0 },
	{ X86::CMOVNS16rr, X86::CMOVNS16rm, 0 },
	{ X86::CMOVNS32rr, X86::CMOVNS32rm, 0 },
	{ X86::CMOVNS64rr, X86::CMOVNS64rm, 0 },
	{ X86::CMOVO16rr, X86::CMOVO16rm, 0 },
	{ X86::CMOVO32rr, X86::CMOVO32rm, 0 },
	{ X86::CMOVO64rr, X86::CMOVO64rm, 0 },
	{ X86::CMOVP16rr, X86::CMOVP16rm, 0 },
	{ X86::CMOVP32rr, X86::CMOVP32rm, 0 },
	{ X86::CMOVP64rr, X86::CMOVP64rm, 0 },
	{ X86::CMOVS16rr, X86::CMOVS16rm, 0 },
	{ X86::CMOVS32rr, X86::CMOVS32rm, 0 },
	{ X86::CMOVS64rr, X86::CMOVS64rm, 0 },
	{ X86::CMPPDrri, X86::CMPPDrmi, TB_ALIGN_16 },
	{ X86::CMPPSrri, X86::CMPPSrmi, TB_ALIGN_16 },
	{ X86::CMPSDrr, X86::CMPSDrm, 0 },
	{ X86::CMPSSrr, X86::CMPSSrm, 0 },
	{ X86::CRC32r32r32, X86::CRC32r32m32, 0 },
	{ X86::CRC32r64r64, X86::CRC32r64m64, 0 },
	{ X86::DIVPDrr, X86::DIVPDrm, TB_ALIGN_16 },
	{ X86::DIVPSrr, X86::DIVPSrm, TB_ALIGN_16 },
	{ X86::DIVSDrr, X86::DIVSDrm, 0 },
	{ X86::DIVSDrr_Int, X86::DIVSDrm_Int, TB_NO_REVERSE },
	{ X86::DIVSSrr, X86::DIVSSrm, 0 },
	{ X86::DIVSSrr_Int, X86::DIVSSrm_Int, TB_NO_REVERSE },
	{ X86::DPPDrri, X86::DPPDrmi, TB_ALIGN_16 },
	{ X86::DPPSrri, X86::DPPSrmi, TB_ALIGN_16 },
	{ X86::HADDPDrr, X86::HADDPDrm, TB_ALIGN_16 },
	{ X86::HADDPSrr, X86::HADDPSrm, TB_ALIGN_16 },
	{ X86::HSUBPDrr, X86::HSUBPDrm, TB_ALIGN_16 },
	{ X86::HSUBPSrr, X86::HSUBPSrm, TB_ALIGN_16 },
	{ X86::IMUL16rr, X86::IMUL16rm, 0 },
	{ X86::IMUL32rr, X86::IMUL32rm, 0 },
	{ X86::IMUL64rr, X86::IMUL64rm, 0 },
	{ X86::Int_CMPSDrr, X86::Int_CMPSDrm, TB_NO_REVERSE },
	{ X86::Int_CMPSSrr, X86::Int_CMPSSrm, TB_NO_REVERSE },
	{ X86::Int_CVTSD2SSrr, X86::Int_CVTSD2SSrm, TB_NO_REVERSE },
	{ X86::Int_CVTSI2SD64rr,X86::Int_CVTSI2SD64rm, 0 },
	{ X86::Int_CVTSI2SDrr, X86::Int_CVTSI2SDrm, 0 },
	{ X86::Int_CVTSI2SS64rr,X86::Int_CVTSI2SS64rm, 0 },
	{ X86::Int_CVTSI2SSrr, X86::Int_CVTSI2SSrm, 0 },
	{ X86::Int_CVTSS2SDrr, X86::Int_CVTSS2SDrm, TB_NO_REVERSE },
	{ X86::MAXPDrr, X86::MAXPDrm, TB_ALIGN_16 },
	{ X86::MAXCPDrr, X86::MAXCPDrm, TB_ALIGN_16 },
	{ X86::MAXPSrr, X86::MAXPSrm, TB_ALIGN_16 },
	{ X86::MAXCPSrr, X86::MAXCPSrm, TB_ALIGN_16 },
	{ X86::MAXSDrr, X86::MAXSDrm, 0 },
	{ X86::MAXCSDrr, X86::MAXCSDrm, 0 },
	{ X86::MAXSDrr_Int, X86::MAXSDrm_Int, TB_NO_REVERSE },
	{ X86::MAXSSrr, X86::MAXSSrm, 0 },
	{ X86::MAXCSSrr, X86::MAXCSSrm, 0 },
	{ X86::MAXSSrr_Int, X86::MAXSSrm_Int, TB_NO_REVERSE },
	{ X86::MINPDrr, X86::MINPDrm, TB_ALIGN_16 },
	{ X86::MINCPDrr, X86::MINCPDrm, TB_ALIGN_16 },
	{ X86::MINPSrr, X86::MINPSrm, TB_ALIGN_16 },
	{ X86::MINCPSrr, X86::MINCPSrm, TB_ALIGN_16 },
	{ X86::MINSDrr, X86::MINSDrm, 0 },
	{ X86::MINCSDrr, X86::MINCSDrm, 0 },
	{ X86::MINSDrr_Int, X86::MINSDrm_Int, TB_NO_REVERSE },
	{ X86::MINSSrr, X86::MINSSrm, 0 },
	{ X86::MINCSSrr, X86::MINCSSrm, 0 },
	{ X86::MINSSrr_Int, X86::MINSSrm_Int, TB_NO_REVERSE },
	{ X86::MOVLHPSrr, X86::MOVHPSrm, TB_NO_REVERSE },
	{ X86::MPSADBWrri, X86::MPSADBWrmi, TB_ALIGN_16 },
	{ X86::MULPDrr, X86::MULPDrm, TB_ALIGN_16 },
	{ X86::MULPSrr, X86::MULPSrm, TB_ALIGN_16 },
	{ X86::MULSDrr, X86::MULSDrm, 0 },
	{ X86::MULSDrr_Int, X86::MULSDrm_Int, TB_NO_REVERSE },
	{ X86::MULSSrr, X86::MULSSrm, 0 },
	{ X86::MULSSrr_Int, X86::MULSSrm_Int, TB_NO_REVERSE },
	{ X86::OR16rr, X86::OR16rm, 0 },
	{ X86::OR32rr, X86::OR32rm, 0 },
	{ X86::OR64rr, X86::OR64rm, 0 },
	{ X86::OR8rr, X86::OR8rm, 0 },
	{ X86::ORPDrr, X86::ORPDrm, TB_ALIGN_16 },
	{ X86::ORPSrr, X86::ORPSrm, TB_ALIGN_16 },
	{ X86::PACKSSDWrr, X86::PACKSSDWrm, TB_ALIGN_16 },
	{ X86::PACKSSWBrr, X86::PACKSSWBrm, TB_ALIGN_16 },
	{ X86::PACKUSDWrr, X86::PACKUSDWrm, TB_ALIGN_16 },
	{ X86::PACKUSWBrr, X86::PACKUSWBrm, TB_ALIGN_16 },
	{ X86::PADDBrr, X86::PADDBrm, TB_ALIGN_16 },
	{ X86::PADDDrr, X86::PADDDrm, TB_ALIGN_16 },
	{ X86::PADDQrr, X86::PADDQrm, TB_ALIGN_16 },
	{ X86::PADDSBrr, X86::PADDSBrm, TB_ALIGN_16 },
	{ X86::PADDSWrr, X86::PADDSWrm, TB_ALIGN_16 },
	{ X86::PADDUSBrr, X86::PADDUSBrm, TB_ALIGN_16 },
	{ X86::PADDUSWrr, X86::PADDUSWrm, TB_ALIGN_16 },
	{ X86::PADDWrr, X86::PADDWrm, TB_ALIGN_16 },
	{ X86::PALIGNRrri, X86::PALIGNRrmi, TB_ALIGN_16 },
	{ X86::PANDNrr, X86::PANDNrm, TB_ALIGN_16 },
	{ X86::PANDrr, X86::PANDrm, TB_ALIGN_16 },
	{ X86::PAVGBrr, X86::PAVGBrm, TB_ALIGN_16 },
	{ X86::PAVGWrr, X86::PAVGWrm, TB_ALIGN_16 },
	{ X86::PBLENDVBrr0, X86::PBLENDVBrm0, TB_ALIGN_16 },
	{ X86::PBLENDWrri, X86::PBLENDWrmi, TB_ALIGN_16 },
	{ X86::PCLMULQDQrr, X86::PCLMULQDQrm, TB_ALIGN_16 },
	{ X86::PCMPEQBrr, X86::PCMPEQBrm, TB_ALIGN_16 },
	{ X86::PCMPEQDrr, X86::PCMPEQDrm, TB_ALIGN_16 },
	{ X86::PCMPEQQrr, X86::PCMPEQQrm, TB_ALIGN_16 },
	{ X86::PCMPEQWrr, X86::PCMPEQWrm, TB_ALIGN_16 },
	{ X86::PCMPGTBrr, X86::PCMPGTBrm, TB_ALIGN_16 },
	{ X86::PCMPGTDrr, X86::PCMPGTDrm, TB_ALIGN_16 },
	{ X86::PCMPGTQrr, X86::PCMPGTQrm, TB_ALIGN_16 },
	{ X86::PCMPGTWrr, X86::PCMPGTWrm, TB_ALIGN_16 },
	{ X86::PHADDDrr, X86::PHADDDrm, TB_ALIGN_16 },
	{ X86::PHADDWrr, X86::PHADDWrm, TB_ALIGN_16 },
	{ X86::PHADDSWrr128, X86::PHADDSWrm128, TB_ALIGN_16 },
	{ X86::PHSUBDrr, X86::PHSUBDrm, TB_ALIGN_16 },
	{ X86::PHSUBSWrr128, X86::PHSUBSWrm128, TB_ALIGN_16 },
	{ X86::PHSUBWrr, X86::PHSUBWrm, TB_ALIGN_16 },
	{ X86::PINSRBrr, X86::PINSRBrm, 0 },
	{ X86::PINSRDrr, X86::PINSRDrm, 0 },
	{ X86::PINSRQrr, X86::PINSRQrm, 0 },
	{ X86::PINSRWrri, X86::PINSRWrmi, 0 },
	{ X86::PMADDUBSWrr, X86::PMADDUBSWrm, TB_ALIGN_16 },
	{ X86::PMADDWDrr, X86::PMADDWDrm, TB_ALIGN_16 },
	{ X86::PMAXSWrr, X86::PMAXSWrm, TB_ALIGN_16 },
	{ X86::PMAXUBrr, X86::PMAXUBrm, TB_ALIGN_16 },
	{ X86::PMINSWrr, X86::PMINSWrm, TB_ALIGN_16 },
	{ X86::PMINUBrr, X86::PMINUBrm, TB_ALIGN_16 },
	{ X86::PMINSBrr, X86::PMINSBrm, TB_ALIGN_16 },
	{ X86::PMINSDrr, X86::PMINSDrm, TB_ALIGN_16 },
	{ X86::PMINUDrr, X86::PMINUDrm, TB_ALIGN_16 },
	{ X86::PMINUWrr, X86::PMINUWrm, TB_ALIGN_16 },
	{ X86::PMAXSBrr, X86::PMAXSBrm, TB_ALIGN_16 },
	{ X86::PMAXSDrr, X86::PMAXSDrm, TB_ALIGN_16 },
	{ X86::PMAXUDrr, X86::PMAXUDrm, TB_ALIGN_16 },
	{ X86::PMAXUWrr, X86::PMAXUWrm, TB_ALIGN_16 },
	{ X86::PMULDQrr, X86::PMULDQrm, TB_ALIGN_16 },
	{ X86::PMULHRSWrr, X86::PMULHRSWrm, TB_ALIGN_16 },
	{ X86::PMULHUWrr, X86::PMULHUWrm, TB_ALIGN_16 },
	{ X86::PMULHWrr, X86::PMULHWrm, TB_ALIGN_16 },
	{ X86::PMULLDrr, X86::PMULLDrm, TB_ALIGN_16 },
	{ X86::PMULLWrr, X86::PMULLWrm, TB_ALIGN_16 },
	{ X86::PMULUDQrr, X86::PMULUDQrm, TB_ALIGN_16 },
	{ X86::PORrr, X86::PORrm, TB_ALIGN_16 },
	{ X86::PSADBWrr, X86::PSADBWrm, TB_ALIGN_16 },
	{ X86::PSHUFBrr, X86::PSHUFBrm, TB_ALIGN_16 },
	{ X86::PSIGNBrr128, X86::PSIGNBrm128, TB_ALIGN_16 },
	{ X86::PSIGNWrr128, X86::PSIGNWrm128, TB_ALIGN_16 },
	{ X86::PSIGNDrr128, X86::PSIGNDrm128, TB_ALIGN_16 },
	{ X86::PSLLDrr, X86::PSLLDrm, TB_ALIGN_16 },
	{ X86::PSLLQrr, X86::PSLLQrm, TB_ALIGN_16 },
	{ X86::PSLLWrr, X86::PSLLWrm, TB_ALIGN_16 },
	{ X86::PSRADrr, X86::PSRADrm, TB_ALIGN_16 },
	{ X86::PSRAWrr, X86::PSRAWrm, TB_ALIGN_16 },
	{ X86::PSRLDrr, X86::PSRLDrm, TB_ALIGN_16 },
	{ X86::PSRLQrr, X86::PSRLQrm, TB_ALIGN_16 },
	{ X86::PSRLWrr, X86::PSRLWrm, TB_ALIGN_16 },
	{ X86::PSUBBrr, X86::PSUBBrm, TB_ALIGN_16 },
	{ X86::PSUBDrr, X86::PSUBDrm, TB_ALIGN_16 },
	{ X86::PSUBQrr, X86::PSUBQrm, TB_ALIGN_16 },
	{ X86::PSUBSBrr, X86::PSUBSBrm, TB_ALIGN_16 },
	{ X86::PSUBSWrr, X86::PSUBSWrm, TB_ALIGN_16 },
	{ X86::PSUBUSBrr, X86::PSUBUSBrm, TB_ALIGN_16 },
	{ X86::PSUBUSWrr, X86::PSUBUSWrm, TB_ALIGN_16 },
	{ X86::PSUBWrr, X86::PSUBWrm, TB_ALIGN_16 },
	{ X86::PUNPCKHBWrr, X86::PUNPCKHBWrm, TB_ALIGN_16 },
	{ X86::PUNPCKHDQrr, X86::PUNPCKHDQrm, TB_ALIGN_16 },
	{ X86::PUNPCKHQDQrr, X86::PUNPCKHQDQrm, TB_ALIGN_16 },
	{ X86::PUNPCKHWDrr, X86::PUNPCKHWDrm, TB_ALIGN_16 },
	{ X86::PUNPCKLBWrr, X86::PUNPCKLBWrm, TB_ALIGN_16 },
	{ X86::PUNPCKLDQrr, X86::PUNPCKLDQrm, TB_ALIGN_16 },
	{ X86::PUNPCKLQDQrr, X86::PUNPCKLQDQrm, TB_ALIGN_16 },
	{ X86::PUNPCKLWDrr, X86::PUNPCKLWDrm, TB_ALIGN_16 },
	{ X86::PXORrr, X86::PXORrm, TB_ALIGN_16 },
	{ X86::ROUNDSDr_Int, X86::ROUNDSDm_Int, TB_NO_REVERSE },
	{ X86::ROUNDSSr_Int, X86::ROUNDSSm_Int, TB_NO_REVERSE },
	{ X86::SBB32rr, X86::SBB32rm, 0 },
	{ X86::SBB64rr, X86::SBB64rm, 0 },
	{ X86::SHUFPDrri, X86::SHUFPDrmi, TB_ALIGN_16 },
	{ X86::SHUFPSrri, X86::SHUFPSrmi, TB_ALIGN_16 },
	{ X86::SUB16rr, X86::SUB16rm, 0 },
	{ X86::SUB32rr, X86::SUB32rm, 0 },
	{ X86::SUB64rr, X86::SUB64rm, 0 },
	{ X86::SUB8rr, X86::SUB8rm, 0 },
	{ X86::SUBPDrr, X86::SUBPDrm, TB_ALIGN_16 },
	{ X86::SUBPSrr, X86::SUBPSrm, TB_ALIGN_16 },
	{ X86::SUBSDrr, X86::SUBSDrm, 0 },
	{ X86::SUBSDrr_Int, X86::SUBSDrm_Int, TB_NO_REVERSE },
	{ X86::SUBSSrr, X86::SUBSSrm, 0 },
	{ X86::SUBSSrr_Int, X86::SUBSSrm_Int, TB_NO_REVERSE },
	// FIXME: TESTrr -> swapped operand of TESTmr.
	{ X86::UNPCKHPDrr, X86::UNPCKHPDrm, TB_ALIGN_16 },
	{ X86::UNPCKHPSrr, X86::UNPCKHPSrm, TB_ALIGN_16 },
	{ X86::UNPCKLPDrr, X86::UNPCKLPDrm, TB_ALIGN_16 },
	{ X86::UNPCKLPSrr, X86::UNPCKLPSrm, TB_ALIGN_16 },
	{ X86::XOR16rr, X86::XOR16rm, 0 },
	{ X86::XOR32rr, X86::XOR32rm, 0 },
	{ X86::XOR64rr, X86::XOR64rm, 0 },
	{ X86::XOR8rr, X86::XOR8rm, 0 },
	{ X86::XORPDrr, X86::XORPDrm, TB_ALIGN_16 },
	{ X86::XORPSrr, X86::XORPSrm, TB_ALIGN_16 },

	// MMX version of foldable instructions
	{ X86::MMX_CVTPI2PSirr, X86::MMX_CVTPI2PSirm, 0 },
	{ X86::MMX_PACKSSDWirr, X86::MMX_PACKSSDWirm, 0 },
	{ X86::MMX_PACKSSWBirr, X86::MMX_PACKSSWBirm, 0 },
	{ X86::MMX_PACKUSWBirr, X86::MMX_PACKUSWBirm, 0 },
	{ X86::MMX_PADDBirr, X86::MMX_PADDBirm, 0 },
	{ X86::MMX_PADDDirr, X86::MMX_PADDDirm, 0 },
	{ X86::MMX_PADDQirr, X86::MMX_PADDQirm, 0 },
	{ X86::MMX_PADDSBirr, X86::MMX_PADDSBirm, 0 },
	{ X86::MMX_PADDSWirr, X86::MMX_PADDSWirm, 0 },
	{ X86::MMX_PADDUSBirr, X86::MMX_PADDUSBirm, 0 },
	{ X86::MMX_PADDUSWirr, X86::MMX_PADDUSWirm, 0 },
	{ X86::MMX_PADDWirr, X86::MMX_PADDWirm, 0 },
	{ X86::MMX_PALIGNR64irr, X86::MMX_PALIGNR64irm, 0 },
	{ X86::MMX_PANDNirr, X86::MMX_PANDNirm, 0 },
	{ X86::MMX_PANDirr, X86::MMX_PANDirm, 0 },
	{ X86::MMX_PAVGBirr, X86::MMX_PAVGBirm, 0 },
	{ X86::MMX_PAVGWirr, X86::MMX_PAVGWirm, 0 },
	{ X86::MMX_PCMPEQBirr, X86::MMX_PCMPEQBirm, 0 },
	{ X86::MMX_PCMPEQDirr, X86::MMX_PCMPEQDirm, 0 },
	{ X86::MMX_PCMPEQWirr, X86::MMX_PCMPEQWirm, 0 },
	{ X86::MMX_PCMPGTBirr, X86::MMX_PCMPGTBirm, 0 },
	{ X86::MMX_PCMPGTDirr, X86::MMX_PCMPGTDirm, 0 },
	{ X86::MMX_PCMPGTWirr, X86::MMX_PCMPGTWirm, 0 },
	{ X86::MMX_PHADDSWrr64, X86::MMX_PHADDSWrm64, 0 },
	{ X86::MMX_PHADDWrr64, X86::MMX_PHADDWrm64, 0 },
	{ X86::MMX_PHADDrr64, X86::MMX_PHADDrm64, 0 },
	{ X86::MMX_PHSUBDrr64, X86::MMX_PHSUBDrm64, 0 },
	{ X86::MMX_PHSUBSWrr64, X86::MMX_PHSUBSWrm64, 0 },
	{ X86::MMX_PHSUBWrr64, X86::MMX_PHSUBWrm64, 0 },
	{ X86::MMX_PINSRWirri, X86::MMX_PINSRWirmi, 0 },
	{ X86::MMX_PMADDUBSWrr64, X86::MMX_PMADDUBSWrm64, 0 },
	{ X86::MMX_PMADDWDirr, X86::MMX_PMADDWDirm, 0 },
	{ X86::MMX_PMAXSWirr, X86::MMX_PMAXSWirm, 0 },
	{ X86::MMX_PMAXUBirr, X86::MMX_PMAXUBirm, 0 },
	{ X86::MMX_PMINSWirr, X86::MMX_PMINSWirm, 0 },
	{ X86::MMX_PMINUBirr, X86::MMX_PMINUBirm, 0 },
	{ X86::MMX_PMULHRSWrr64, X86::MMX_PMULHRSWrm64, 0 },
	{ X86::MMX_PMULHUWirr, X86::MMX_PMULHUWirm, 0 },
	{ X86::MMX_PMULHWirr, X86::MMX_PMULHWirm, 0 },
	{ X86::MMX_PMULLWirr, X86::MMX_PMULLWirm, 0 },
	{ X86::MMX_PMULUDQirr, X86::MMX_PMULUDQirm, 0 },
	{ X86::MMX_PORirr, X86::MMX_PORirm, 0 },
	{ X86::MMX_PSADBWirr, X86::MMX_PSADBWirm, 0 },
	{ X86::MMX_PSHUFBrr64, X86::MMX_PSHUFBrm64, 0 },
	{ X86::MMX_PSIGNBrr64, X86::MMX_PSIGNBrm64, 0 },
	{ X86::MMX_PSIGNDrr64, X86::MMX_PSIGNDrm64, 0 },
	{ X86::MMX_PSIGNWrr64, X86::MMX_PSIGNWrm64, 0 },
	{ X86::MMX_PSLLDrr, X86::MMX_PSLLDrm, 0 },
	{ X86::MMX_PSLLQrr, X86::MMX_PSLLQrm, 0 },
	{ X86::MMX_PSLLWrr, X86::MMX_PSLLWrm, 0 },
	{ X86::MMX_PSRADrr, X86::MMX_PSRADrm, 0 },
	{ X86::MMX_PSRAWrr, X86::MMX_PSRAWrm, 0 },
	{ X86::MMX_PSRLDrr, X86::MMX_PSRLDrm, 0 },
	{ X86::MMX_PSRLQrr, X86::MMX_PSRLQrm, 0 },
	{ X86::MMX_PSRLWrr, X86::MMX_PSRLWrm, 0 },
	{ X86::MMX_PSUBBirr, X86::MMX_PSUBBirm, 0 },
	{ X86::MMX_PSUBDirr, X86::MMX_PSUBDirm, 0 },
	{ X86::MMX_PSUBQirr, X86::MMX_PSUBQirm, 0 },
	{ X86::MMX_PSUBSBirr, X86::MMX_PSUBSBirm, 0 },
	{ X86::MMX_PSUBSWirr, X86::MMX_PSUBSWirm, 0 },
	{ X86::MMX_PSUBUSBirr, X86::MMX_PSUBUSBirm, 0 },
	{ X86::MMX_PSUBUSWirr, X86::MMX_PSUBUSWirm, 0 },
	{ X86::MMX_PSUBWirr, X86::MMX_PSUBWirm, 0 },
	{ X86::MMX_PUNPCKHBWirr, X86::MMX_PUNPCKHBWirm, 0 },
	{ X86::MMX_PUNPCKHDQirr, X86::MMX_PUNPCKHDQirm, 0 },
	{ X86::MMX_PUNPCKHWDirr, X86::MMX_PUNPCKHWDirm, 0 },
	{ X86::MMX_PUNPCKLBWirr, X86::MMX_PUNPCKLBWirm, 0 },
	{ X86::MMX_PUNPCKLDQirr, X86::MMX_PUNPCKLDQirm, 0 },
	{ X86::MMX_PUNPCKLWDirr, X86::MMX_PUNPCKLWDirm, 0 },
	{ X86::MMX_PXORirr, X86::MMX_PXORirm, 0 },

	// 3DNow! version of foldable instructions
	{ X86::PAVGUSBrr, X86::PAVGUSBrm, 0 },
	{ X86::PFACCrr, X86::PFACCrm, 0 },
	{ X86::PFADDrr, X86::PFADDrm, 0 },
	{ X86::PFCMPEQrr, X86::PFCMPEQrm, 0 },
	{ X86::PFCMPGErr, X86::PFCMPGErm, 0 },
	{ X86::PFCMPGTrr, X86::PFCMPGTrm, 0 },
	{ X86::PFMAXrr, X86::PFMAXrm, 0 },
	{ X86::PFMINrr, X86::PFMINrm, 0 },
	{ X86::PFMULrr, X86::PFMULrm, 0 },
	{ X86::PFNACCrr, X86::PFNACCrm, 0 },
	{ X86::PFPNACCrr, X86::PFPNACCrm, 0 },
	{ X86::PFRCPIT1rr, X86::PFRCPIT1rm, 0 },
	{ X86::PFRCPIT2rr, X86::PFRCPIT2rm, 0 },
	{ X86::PFRSQIT1rr, X86::PFRSQIT1rm, 0 },
	{ X86::PFSUBrr, X86::PFSUBrm, 0 },
	{ X86::PFSUBRrr, X86::PFSUBRrm, 0 },
	{ X86::PMULHRWrr, X86::PMULHRWrm, 0 },

	// AVX 128-bit versions of foldable instructions
	{ X86::VCVTSD2SSrr, X86::VCVTSD2SSrm, 0 },
	{ X86::Int_VCVTSD2SSrr, X86::Int_VCVTSD2SSrm, TB_NO_REVERSE },
	{ X86::VCVTSI2SD64rr, X86::VCVTSI2SD64rm, 0 },
	{ X86::Int_VCVTSI2SD64rr, X86::Int_VCVTSI2SD64rm, 0 },
	{ X86::VCVTSI2SDrr, X86::VCVTSI2SDrm, 0 },
	{ X86::Int_VCVTSI2SDrr, X86::Int_VCVTSI2SDrm, 0 },
	{ X86::VCVTSI2SS64rr, X86::VCVTSI2SS64rm, 0 },
	{ X86::Int_VCVTSI2SS64rr, X86::Int_VCVTSI2SS64rm, 0 },
	{ X86::VCVTSI2SSrr, X86::VCVTSI2SSrm, 0 },
	{ X86::Int_VCVTSI2SSrr, X86::Int_VCVTSI2SSrm, 0 },
	{ X86::VCVTSS2SDrr, X86::VCVTSS2SDrm, 0 },
	{ X86::Int_VCVTSS2SDrr, X86::Int_VCVTSS2SDrm, TB_NO_REVERSE },
	{ X86::VADDPDrr, X86::VADDPDrm, 0 },
	{ X86::VADDPSrr, X86::VADDPSrm, 0 },
	{ X86::VADDSDrr, X86::VADDSDrm, 0 },
	{ X86::VADDSDrr_Int, X86::VADDSDrm_Int, TB_NO_REVERSE },
	{ X86::VADDSSrr, X86::VADDSSrm, 0 },
	{ X86::VADDSSrr_Int, X86::VADDSSrm_Int, TB_NO_REVERSE },
	{ X86::VADDSUBPDrr, X86::VADDSUBPDrm, 0 },
	{ X86::VADDSUBPSrr, X86::VADDSUBPSrm, 0 },
	{ X86::VANDNPDrr, X86::VANDNPDrm, 0 },
	{ X86::VANDNPSrr, X86::VANDNPSrm, 0 },
	{ X86::VANDPDrr, X86::VANDPDrm, 0 },
	{ X86::VANDPSrr, X86::VANDPSrm, 0 },
	{ X86::VBLENDPDrri, X86::VBLENDPDrmi, 0 },
	{ X86::VBLENDPSrri, X86::VBLENDPSrmi, 0 },
	{ X86::VBLENDVPDrr, X86::VBLENDVPDrm, 0 },
	{ X86::VBLENDVPSrr, X86::VBLENDVPSrm, 0 },
	{ X86::VCMPPDrri, X86::VCMPPDrmi, 0 },
	{ X86::VCMPPSrri, X86::VCMPPSrmi, 0 },
	{ X86::VCMPSDrr, X86::VCMPSDrm, 0 },
	{ X86::VCMPSSrr, X86::VCMPSSrm, 0 },
	{ X86::VDIVPDrr, X86::VDIVPDrm, 0 },
	{ X86::VDIVPSrr, X86::VDIVPSrm, 0 },
	{ X86::VDIVSDrr, X86::VDIVSDrm, 0 },
	{ X86::VDIVSDrr_Int, X86::VDIVSDrm_Int, TB_NO_REVERSE },
	{ X86::VDIVSSrr, X86::VDIVSSrm, 0 },
	{ X86::VDIVSSrr_Int, X86::VDIVSSrm_Int, TB_NO_REVERSE },
	{ X86::VDPPDrri, X86::VDPPDrmi, 0 },
	{ X86::VDPPSrri, X86::VDPPSrmi, 0 },
	{ X86::VHADDPDrr, X86::VHADDPDrm, 0 },
	{ X86::VHADDPSrr, X86::VHADDPSrm, 0 },
	{ X86::VHSUBPDrr, X86::VHSUBPDrm, 0 },
	{ X86::VHSUBPSrr, X86::VHSUBPSrm, 0 },
	{ X86::Int_VCMPSDrr, X86::Int_VCMPSDrm, TB_NO_REVERSE },
	{ X86::Int_VCMPSSrr, X86::Int_VCMPSSrm, TB_NO_REVERSE },
	{ X86::VMAXCPDrr, X86::VMAXCPDrm, 0 },
	{ X86::VMAXCPSrr, X86::VMAXCPSrm, 0 },
	{ X86::VMAXCSDrr, X86::VMAXCSDrm, 0 },
	{ X86::VMAXCSSrr, X86::VMAXCSSrm, 0 },
	{ X86::VMAXPDrr, X86::VMAXPDrm, 0 },
	{ X86::VMAXPSrr, X86::VMAXPSrm, 0 },
	{ X86::VMAXSDrr, X86::VMAXSDrm, 0 },
	{ X86::VMAXSDrr_Int, X86::VMAXSDrm_Int, TB_NO_REVERSE },
	{ X86::VMAXSSrr, X86::VMAXSSrm, 0 },
	{ X86::VMAXSSrr_Int, X86::VMAXSSrm_Int, TB_NO_REVERSE },
	{ X86::VMINCPDrr, X86::VMINCPDrm, 0 },
	{ X86::VMINCPSrr, X86::VMINCPSrm, 0 },
	{ X86::VMINCSDrr, X86::VMINCSDrm, 0 },
	{ X86::VMINCSSrr, X86::VMINCSSrm, 0 },
	{ X86::VMINPDrr, X86::VMINPDrm, 0 },
	{ X86::VMINPSrr, X86::VMINPSrm, 0 },
	{ X86::VMINSDrr, X86::VMINSDrm, 0 },
	{ X86::VMINSDrr_Int, X86::VMINSDrm_Int, TB_NO_REVERSE },
	{ X86::VMINSSrr, X86::VMINSSrm, 0 },
	{ X86::VMINSSrr_Int, X86::VMINSSrm_Int, TB_NO_REVERSE },
	{ X86::VMOVLHPSrr, X86::VMOVHPSrm, TB_NO_REVERSE },
	{ X86::VMPSADBWrri, X86::VMPSADBWrmi, 0 },
	{ X86::VMULPDrr, X86::VMULPDrm, 0 },
	{ X86::VMULPSrr, X86::VMULPSrm, 0 },
	{ X86::VMULSDrr, X86::VMULSDrm, 0 },
	{ X86::VMULSDrr_Int, X86::VMULSDrm_Int, TB_NO_REVERSE },
	{ X86::VMULSSrr, X86::VMULSSrm, 0 },
	{ X86::VMULSSrr_Int, X86::VMULSSrm_Int, TB_NO_REVERSE },
	{ X86::VORPDrr, X86::VORPDrm, 0 },
	{ X86::VORPSrr, X86::VORPSrm, 0 },
	{ X86::VPACKSSDWrr, X86::VPACKSSDWrm, 0 },
	{ X86::VPACKSSWBrr, X86::VPACKSSWBrm, 0 },
	{ X86::VPACKUSDWrr, X86::VPACKUSDWrm, 0 },
	{ X86::VPACKUSWBrr, X86::VPACKUSWBrm, 0 },
	{ X86::VPADDBrr, X86::VPADDBrm, 0 },
	{ X86::VPADDDrr, X86::VPADDDrm, 0 },
	{ X86::VPADDQrr, X86::VPADDQrm, 0 },
	{ X86::VPADDSBrr, X86::VPADDSBrm, 0 },
	{ X86::VPADDSWrr, X86::VPADDSWrm, 0 },
	{ X86::VPADDUSBrr, X86::VPADDUSBrm, 0 },
	{ X86::VPADDUSWrr, X86::VPADDUSWrm, 0 },
	{ X86::VPADDWrr, X86::VPADDWrm, 0 },
	{ X86::VPALIGNRrri, X86::VPALIGNRrmi, 0 },
	{ X86::VPANDNrr, X86::VPANDNrm, 0 },
	{ X86::VPANDrr, X86::VPANDrm, 0 },
	{ X86::VPAVGBrr, X86::VPAVGBrm, 0 },
	{ X86::VPAVGWrr, X86::VPAVGWrm, 0 },
	{ X86::VPBLENDVBrr, X86::VPBLENDVBrm, 0 },
	{ X86::VPBLENDWrri, X86::VPBLENDWrmi, 0 },
	{ X86::VPCLMULQDQrr, X86::VPCLMULQDQrm, 0 },
	{ X86::VPCMPEQBrr, X86::VPCMPEQBrm, 0 },
	{ X86::VPCMPEQDrr, X86::VPCMPEQDrm, 0 },
	{ X86::VPCMPEQQrr, X86::VPCMPEQQrm, 0 },
	{ X86::VPCMPEQWrr, X86::VPCMPEQWrm, 0 },
	{ X86::VPCMPGTBrr, X86::VPCMPGTBrm, 0 },
	{ X86::VPCMPGTDrr, X86::VPCMPGTDrm, 0 },
	{ X86::VPCMPGTQrr, X86::VPCMPGTQrm, 0 },
	{ X86::VPCMPGTWrr, X86::VPCMPGTWrm, 0 },
	{ X86::VPHADDDrr, X86::VPHADDDrm, 0 },
	{ X86::VPHADDSWrr128, X86::VPHADDSWrm128, 0 },
	{ X86::VPHADDWrr, X86::VPHADDWrm, 0 },
	{ X86::VPHSUBDrr, X86::VPHSUBDrm, 0 },
	{ X86::VPHSUBSWrr128, X86::VPHSUBSWrm128, 0 },
	{ X86::VPHSUBWrr, X86::VPHSUBWrm, 0 },
	{ X86::VPERMILPDrr, X86::VPERMILPDrm, 0 },
	{ X86::VPERMILPSrr, X86::VPERMILPSrm, 0 },
	{ X86::VPINSRBrr, X86::VPINSRBrm, 0 },
	{ X86::VPINSRDrr, X86::VPINSRDrm, 0 },
	{ X86::VPINSRQrr, X86::VPINSRQrm, 0 },
	{ X86::VPINSRWrri, X86::VPINSRWrmi, 0 },
	{ X86::VPMADDUBSWrr, X86::VPMADDUBSWrm, 0 },
	{ X86::VPMADDWDrr, X86::VPMADDWDrm, 0 },
	{ X86::VPMAXSWrr, X86::VPMAXSWrm, 0 },
	{ X86::VPMAXUBrr, X86::VPMAXUBrm, 0 },
	{ X86::VPMINSWrr, X86::VPMINSWrm, 0 },
	{ X86::VPMINUBrr, X86::VPMINUBrm, 0 },
	{ X86::VPMINSBrr, X86::VPMINSBrm, 0 },
	{ X86::VPMINSDrr, X86::VPMINSDrm, 0 },
	{ X86::VPMINUDrr, X86::VPMINUDrm, 0 },
	{ X86::VPMINUWrr, X86::VPMINUWrm, 0 },
	{ X86::VPMAXSBrr, X86::VPMAXSBrm, 0 },
	{ X86::VPMAXSDrr, X86::VPMAXSDrm, 0 },
	{ X86::VPMAXUDrr, X86::VPMAXUDrm, 0 },
	{ X86::VPMAXUWrr, X86::VPMAXUWrm, 0 },
	{ X86::VPMULDQrr, X86::VPMULDQrm, 0 },
	{ X86::VPMULHRSWrr, X86::VPMULHRSWrm, 0 },
	{ X86::VPMULHUWrr, X86::VPMULHUWrm, 0 },
	{ X86::VPMULHWrr, X86::VPMULHWrm, 0 },
	{ X86::VPMULLDrr, X86::VPMULLDrm, 0 },
	{ X86::VPMULLWrr, X86::VPMULLWrm, 0 },
	{ X86::VPMULUDQrr, X86::VPMULUDQrm, 0 },
	{ X86::VPORrr, X86::VPORrm, 0 },
	{ X86::VPSADBWrr, X86::VPSADBWrm, 0 },
	{ X86::VPSHUFBrr, X86::VPSHUFBrm, 0 },
	{ X86::VPSIGNBrr128, X86::VPSIGNBrm128, 0 },
	{ X86::VPSIGNWrr128, X86::VPSIGNWrm128, 0 },
	{ X86::VPSIGNDrr128, X86::VPSIGNDrm128, 0 },
	{ X86::VPSLLDrr, X86::VPSLLDrm, 0 },
	{ X86::VPSLLQrr, X86::VPSLLQrm, 0 },
	{ X86::VPSLLWrr, X86::VPSLLWrm, 0 },
	{ X86::VPSRADrr, X86::VPSRADrm, 0 },
	{ X86::VPSRAWrr, X86::VPSRAWrm, 0 },
	{ X86::VPSRLDrr, X86::VPSRLDrm, 0 },
	{ X86::VPSRLQrr, X86::VPSRLQrm, 0 },
	{ X86::VPSRLWrr, X86::VPSRLWrm, 0 },
	{ X86::VPSUBBrr, X86::VPSUBBrm, 0 },
	{ X86::VPSUBDrr, X86::VPSUBDrm, 0 },
	{ X86::VPSUBQrr, X86::VPSUBQrm, 0 },
	{ X86::VPSUBSBrr, X86::VPSUBSBrm, 0 },
	{ X86::VPSUBSWrr, X86::VPSUBSWrm, 0 },
	{ X86::VPSUBUSBrr, X86::VPSUBUSBrm, 0 },
	{ X86::VPSUBUSWrr, X86::VPSUBUSWrm, 0 },
	{ X86::VPSUBWrr, X86::VPSUBWrm, 0 },
	{ X86::VPUNPCKHBWrr, X86::VPUNPCKHBWrm, 0 },
	{ X86::VPUNPCKHDQrr, X86::VPUNPCKHDQrm, 0 },
	{ X86::VPUNPCKHQDQrr, X86::VPUNPCKHQDQrm, 0 },
	{ X86::VPUNPCKHWDrr, X86::VPUNPCKHWDrm, 0 },
	{ X86::VPUNPCKLBWrr, X86::VPUNPCKLBWrm, 0 },
	{ X86::VPUNPCKLDQrr, X86::VPUNPCKLDQrm, 0 },
	{ X86::VPUNPCKLQDQrr, X86::VPUNPCKLQDQrm, 0 },
	{ X86::VPUNPCKLWDrr, X86::VPUNPCKLWDrm, 0 },
	{ X86::VPXORrr, X86::VPXORrm, 0 },
	{ X86::VRCPSSr, X86::VRCPSSm, 0 },
	{ X86::VRCPSSr_Int, X86::VRCPSSm_Int, TB_NO_REVERSE },
	{ X86::VRSQRTSSr, X86::VRSQRTSSm, 0 },
	{ X86::VRSQRTSSr_Int, X86::VRSQRTSSm_Int, TB_NO_REVERSE },
	{ X86::VROUNDSDr, X86::VROUNDSDm, 0 },
	{ X86::VROUNDSDr_Int, X86::VROUNDSDm_Int, TB_NO_REVERSE },
	{ X86::VROUNDSSr, X86::VROUNDSSm, 0 },
	{ X86::VROUNDSSr_Int, X86::VROUNDSSm_Int, TB_NO_REVERSE },
	{ X86::VSHUFPDrri, X86::VSHUFPDrmi, 0 },
	{ X86::VSHUFPSrri, X86::VSHUFPSrmi, 0 },
	{ X86::VSQRTSDr, X86::VSQRTSDm, 0 },
	{ X86::VSQRTSDr_Int, X86::VSQRTSDm_Int, TB_NO_REVERSE },
	{ X86::VSQRTSSr, X86::VSQRTSSm, 0 },
	{ X86::VSQRTSSr_Int, X86::VSQRTSSm_Int, TB_NO_REVERSE },
	{ X86::VSUBPDrr, X86::VSUBPDrm, 0 },
	{ X86::VSUBPSrr, X86::VSUBPSrm, 0 },
	{ X86::VSUBSDrr, X86::VSUBSDrm, 0 },
	{ X86::VSUBSDrr_Int, X86::VSUBSDrm_Int, TB_NO_REVERSE },
	{ X86::VSUBSSrr, X86::VSUBSSrm, 0 },
	{ X86::VSUBSSrr_Int, X86::VSUBSSrm_Int, TB_NO_REVERSE },
	{ X86::VUNPCKHPDrr, X86::VUNPCKHPDrm, 0 },
	{ X86::VUNPCKHPSrr, X86::VUNPCKHPSrm, 0 },
	{ X86::VUNPCKLPDrr, X86::VUNPCKLPDrm, 0 },
	{ X86::VUNPCKLPSrr, X86::VUNPCKLPSrm, 0 },
	{ X86::VXORPDrr, X86::VXORPDrm, 0 },
	{ X86::VXORPSrr, X86::VXORPSrm, 0 },

	// AVX 256-bit foldable instructions
	{ X86::VADDPDYrr, X86::VADDPDYrm, 0 },
	{ X86::VADDPSYrr, X86::VADDPSYrm, 0 },
	{ X86::VADDSUBPDYrr, X86::VADDSUBPDYrm, 0 },
	{ X86::VADDSUBPSYrr, X86::VADDSUBPSYrm, 0 },
	{ X86::VANDNPDYrr, X86::VANDNPDYrm, 0 },
	{ X86::VANDNPSYrr, X86::VANDNPSYrm, 0 },
	{ X86::VANDPDYrr, X86::VANDPDYrm, 0 },
	{ X86::VANDPSYrr, X86::VANDPSYrm, 0 },
	{ X86::VBLENDPDYrri, X86::VBLENDPDYrmi, 0 },
	{ X86::VBLENDPSYrri, X86::VBLENDPSYrmi, 0 },
	{ X86::VBLENDVPDYrr, X86::VBLENDVPDYrm, 0 },
	{ X86::VBLENDVPSYrr, X86::VBLENDVPSYrm, 0 },
	{ X86::VCMPPDYrri, X86::VCMPPDYrmi, 0 },
	{ X86::VCMPPSYrri, X86::VCMPPSYrmi, 0 },
	{ X86::VDIVPDYrr, X86::VDIVPDYrm, 0 },
	{ X86::VDIVPSYrr, X86::VDIVPSYrm, 0 },
	{ X86::VDPPSYrri, X86::VDPPSYrmi, 0 },
	{ X86::VHADDPDYrr, X86::VHADDPDYrm, 0 },
	{ X86::VHADDPSYrr, X86::VHADDPSYrm, 0 },
	{ X86::VHSUBPDYrr, X86::VHSUBPDYrm, 0 },
	{ X86::VHSUBPSYrr, X86::VHSUBPSYrm, 0 },
	{ X86::VINSERTF128rr, X86::VINSERTF128rm, 0 },
	{ X86::VMAXCPDYrr, X86::VMAXCPDYrm, 0 },
	{ X86::VMAXCPSYrr, X86::VMAXCPSYrm, 0 },
	{ X86::VMAXPDYrr, X86::VMAXPDYrm, 0 },
	{ X86::VMAXPSYrr, X86::VMAXPSYrm, 0 },
	{ X86::VMINCPDYrr, X86::VMINCPDYrm, 0 },
	{ X86::VMINCPSYrr, X86::VMINCPSYrm, 0 },
	{ X86::VMINPDYrr, X86::VMINPDYrm, 0 },
	{ X86::VMINPSYrr, X86::VMINPSYrm, 0 },
	{ X86::VMULPDYrr, X86::VMULPDYrm, 0 },
	{ X86::VMULPSYrr, X86::VMULPSYrm, 0 },
	{ X86::VORPDYrr, X86::VORPDYrm, 0 },
	{ X86::VORPSYrr, X86::VORPSYrm, 0 },
	{ X86::VPERM2F128rr, X86::VPERM2F128rm, 0 },
	{ X86::VPERMILPDYrr, X86::VPERMILPDYrm, 0 },
	{ X86::VPERMILPSYrr, X86::VPERMILPSYrm, 0 },
	{ X86::VSHUFPDYrri, X86::VSHUFPDYrmi, 0 },
	{ X86::VSHUFPSYrri, X86::VSHUFPSYrmi, 0 },
	{ X86::VSUBPDYrr, X86::VSUBPDYrm, 0 },
	{ X86::VSUBPSYrr, X86::VSUBPSYrm, 0 },
	{ X86::VUNPCKHPDYrr, X86::VUNPCKHPDYrm, 0 },
	{ X86::VUNPCKHPSYrr, X86::VUNPCKHPSYrm, 0 },
	{ X86::VUNPCKLPDYrr, X86::VUNPCKLPDYrm, 0 },
	{ X86::VUNPCKLPSYrr, X86::VUNPCKLPSYrm, 0 },
	{ X86::VXORPDYrr, X86::VXORPDYrm, 0 },
	{ X86::VXORPSYrr, X86::VXORPSYrm, 0 },

	// AVX2 foldable instructions
	{ X86::VINSERTI128rr, X86::VINSERTI128rm, 0 },
	{ X86::VPACKSSDWYrr, X86::VPACKSSDWYrm, 0 },
	{ X86::VPACKSSWBYrr, X86::VPACKSSWBYrm, 0 },
	{ X86::VPACKUSDWYrr, X86::VPACKUSDWYrm, 0 },
	{ X86::VPACKUSWBYrr, X86::VPACKUSWBYrm, 0 },
	{ X86::VPADDBYrr, X86::VPADDBYrm, 0 },
	{ X86::VPADDDYrr, X86::VPADDDYrm, 0 },
	{ X86::VPADDQYrr, X86::VPADDQYrm, 0 },
	{ X86::VPADDSBYrr, X86::VPADDSBYrm, 0 },
	{ X86::VPADDSWYrr, X86::VPADDSWYrm, 0 },
	{ X86::VPADDUSBYrr, X86::VPADDUSBYrm, 0 },
	{ X86::VPADDUSWYrr, X86::VPADDUSWYrm, 0 },
	{ X86::VPADDWYrr, X86::VPADDWYrm, 0 },
	{ X86::VPALIGNRYrri, X86::VPALIGNRYrmi, 0 },
	{ X86::VPANDNYrr, X86::VPANDNYrm, 0 },
	{ X86::VPANDYrr, X86::VPANDYrm, 0 },
	{ X86::VPAVGBYrr, X86::VPAVGBYrm, 0 },
	{ X86::VPAVGWYrr, X86::VPAVGWYrm, 0 },
	{ X86::VPBLENDDrri, X86::VPBLENDDrmi, 0 },
	{ X86::VPBLENDDYrri, X86::VPBLENDDYrmi, 0 },
	{ X86::VPBLENDVBYrr, X86::VPBLENDVBYrm, 0 },
	{ X86::VPBLENDWYrri, X86::VPBLENDWYrmi, 0 },
	{ X86::VPCMPEQBYrr, X86::VPCMPEQBYrm, 0 },
	{ X86::VPCMPEQDYrr, X86::VPCMPEQDYrm, 0 },
	{ X86::VPCMPEQQYrr, X86::VPCMPEQQYrm, 0 },
	{ X86::VPCMPEQWYrr, X86::VPCMPEQWYrm, 0 },
	{ X86::VPCMPGTBYrr, X86::VPCMPGTBYrm, 0 },
	{ X86::VPCMPGTDYrr, X86::VPCMPGTDYrm, 0 },
	{ X86::VPCMPGTQYrr, X86::VPCMPGTQYrm, 0 },
	{ X86::VPCMPGTWYrr, X86::VPCMPGTWYrm, 0 },
	{ X86::VPERM2I128rr, X86::VPERM2I128rm, 0 },
	{ X86::VPERMDYrr, X86::VPERMDYrm, 0 },
	{ X86::VPERMPSYrr, X86::VPERMPSYrm, 0 },
	{ X86::VPHADDDYrr, X86::VPHADDDYrm, 0 },
	{ X86::VPHADDSWrr256, X86::VPHADDSWrm256, 0 },
	{ X86::VPHADDWYrr, X86::VPHADDWYrm, 0 },
	{ X86::VPHSUBDYrr, X86::VPHSUBDYrm, 0 },
	{ X86::VPHSUBSWrr256, X86::VPHSUBSWrm256, 0 },
	{ X86::VPHSUBWYrr, X86::VPHSUBWYrm, 0 },
	{ X86::VPMADDUBSWYrr, X86::VPMADDUBSWYrm, 0 },
	{ X86::VPMADDWDYrr, X86::VPMADDWDYrm, 0 },
	{ X86::VPMAXSWYrr, X86::VPMAXSWYrm, 0 },
	{ X86::VPMAXUBYrr, X86::VPMAXUBYrm, 0 },
	{ X86::VPMINSWYrr, X86::VPMINSWYrm, 0 },
	{ X86::VPMINUBYrr, X86::VPMINUBYrm, 0 },
	{ X86::VPMINSBYrr, X86::VPMINSBYrm, 0 },
	{ X86::VPMINSDYrr, X86::VPMINSDYrm, 0 },
	{ X86::VPMINUDYrr, X86::VPMINUDYrm, 0 },
	{ X86::VPMINUWYrr, X86::VPMINUWYrm, 0 },
	{ X86::VPMAXSBYrr, X86::VPMAXSBYrm, 0 },
	{ X86::VPMAXSDYrr, X86::VPMAXSDYrm, 0 },
	{ X86::VPMAXUDYrr, X86::VPMAXUDYrm, 0 },
	{ X86::VPMAXUWYrr, X86::VPMAXUWYrm, 0 },
	{ X86::VMPSADBWYrri, X86::VMPSADBWYrmi, 0 },
	{ X86::VPMULDQYrr, X86::VPMULDQYrm, 0 },
	{ X86::VPMULHRSWYrr, X86::VPMULHRSWYrm, 0 },
	{ X86::VPMULHUWYrr, X86::VPMULHUWYrm, 0 },
	{ X86::VPMULHWYrr, X86::VPMULHWYrm, 0 },
	{ X86::VPMULLDYrr, X86::VPMULLDYrm, 0 },
	{ X86::VPMULLWYrr, X86::VPMULLWYrm, 0 },
	{ X86::VPMULUDQYrr, X86::VPMULUDQYrm, 0 },
	{ X86::VPORYrr, X86::VPORYrm, 0 },
	{ X86::VPSADBWYrr, X86::VPSADBWYrm, 0 },
	{ X86::VPSHUFBYrr, X86::VPSHUFBYrm, 0 },
	{ X86::VPSIGNBYrr256, X86::VPSIGNBYrm256, 0 },
	{ X86::VPSIGNWYrr256, X86::VPSIGNWYrm256, 0 },
	{ X86::VPSIGNDYrr256, X86::VPSIGNDYrm256, 0 },
	{ X86::VPSLLDYrr, X86::VPSLLDYrm, 0 },
	{ X86::VPSLLQYrr, X86::VPSLLQYrm, 0 },
	{ X86::VPSLLWYrr, X86::VPSLLWYrm, 0 },
	{ X86::VPSLLVDrr, X86::VPSLLVDrm, 0 },
	{ X86::VPSLLVDYrr, X86::VPSLLVDYrm, 0 },
	{ X86::VPSLLVQrr, X86::VPSLLVQrm, 0 },
	{ X86::VPSLLVQYrr, X86::VPSLLVQYrm, 0 },
	{ X86::VPSRADYrr, X86::VPSRADYrm, 0 },
	{ X86::VPSRAWYrr, X86::VPSRAWYrm, 0 },
	{ X86::VPSRAVDrr, X86::VPSRAVDrm, 0 },
	{ X86::VPSRAVDYrr, X86::VPSRAVDYrm, 0 },
	{ X86::VPSRLDYrr, X86::VPSRLDYrm, 0 },
	{ X86::VPSRLQYrr, X86::VPSRLQYrm, 0 },
	{ X86::VPSRLWYrr, X86::VPSRLWYrm, 0 },
	{ X86::VPSRLVDrr, X86::VPSRLVDrm, 0 },
	{ X86::VPSRLVDYrr, X86::VPSRLVDYrm, 0 },
	{ X86::VPSRLVQrr, X86::VPSRLVQrm, 0 },
	{ X86::VPSRLVQYrr, X86::VPSRLVQYrm, 0 },
	{ X86::VPSUBBYrr, X86::VPSUBBYrm, 0 },
	{ X86::VPSUBDYrr, X86::VPSUBDYrm, 0 },
	{ X86::VPSUBQYrr, X86::VPSUBQYrm, 0 },
	{ X86::VPSUBSBYrr, X86::VPSUBSBYrm, 0 },
	{ X86::VPSUBSWYrr, X86::VPSUBSWYrm, 0 },
	{ X86::VPSUBUSBYrr, X86::VPSUBUSBYrm, 0 },
	{ X86::VPSUBUSWYrr, X86::VPSUBUSWYrm, 0 },
	{ X86::VPSUBWYrr, X86::VPSUBWYrm, 0 },
	{ X86::VPUNPCKHBWYrr, X86::VPUNPCKHBWYrm, 0 },
	{ X86::VPUNPCKHDQYrr, X86::VPUNPCKHDQYrm, 0 },
	{ X86::VPUNPCKHQDQYrr, X86::VPUNPCKHQDQYrm, 0 },
	{ X86::VPUNPCKHWDYrr, X86::VPUNPCKHWDYrm, 0 },
	{ X86::VPUNPCKLBWYrr, X86::VPUNPCKLBWYrm, 0 },
	{ X86::VPUNPCKLDQYrr, X86::VPUNPCKLDQYrm, 0 },
	{ X86::VPUNPCKLQDQYrr, X86::VPUNPCKLQDQYrm, 0 },
	{ X86::VPUNPCKLWDYrr, X86::VPUNPCKLWDYrm, 0 },
	{ X86::VPXORYrr, X86::VPXORYrm, 0 },

	// FMA4 foldable instructions
	{ X86::VFMADDSS4rr, X86::VFMADDSS4mr, TB_ALIGN_NONE },
	{ X86::VFMADDSS4rr_Int, X86::VFMADDSS4mr_Int, TB_NO_REVERSE },
	{ X86::VFMADDSD4rr, X86::VFMADDSD4mr, TB_ALIGN_NONE },
	{ X86::VFMADDSD4rr_Int, X86::VFMADDSD4mr_Int, TB_NO_REVERSE },
	{ X86::VFMADDPS4rr, X86::VFMADDPS4mr, TB_ALIGN_NONE },
	{ X86::VFMADDPD4rr, X86::VFMADDPD4mr, TB_ALIGN_NONE },
	{ X86::VFMADDPS4Yrr, X86::VFMADDPS4Ymr, TB_ALIGN_NONE },
	{ X86::VFMADDPD4Yrr, X86::VFMADDPD4Ymr, TB_ALIGN_NONE },
	{ X86::VFNMADDSS4rr, X86::VFNMADDSS4mr, TB_ALIGN_NONE },
	{ X86::VFNMADDSS4rr_Int, X86::VFNMADDSS4mr_Int, TB_NO_REVERSE },
	{ X86::VFNMADDSD4rr, X86::VFNMADDSD4mr, TB_ALIGN_NONE },
	{ X86::VFNMADDSD4rr_Int, X86::VFNMADDSD4mr_Int, TB_NO_REVERSE },
	{ X86::VFNMADDPS4rr, X86::VFNMADDPS4mr, TB_ALIGN_NONE },
	{ X86::VFNMADDPD4rr, X86::VFNMADDPD4mr, TB_ALIGN_NONE },
	{ X86::VFNMADDPS4Yrr, X86::VFNMADDPS4Ymr, TB_ALIGN_NONE },
	{ X86::VFNMADDPD4Yrr, X86::VFNMADDPD4Ymr, TB_ALIGN_NONE },
	{ X86::VFMSUBSS4rr, X86::VFMSUBSS4mr, TB_ALIGN_NONE },
	{ X86::VFMSUBSS4rr_Int, X86::VFMSUBSS4mr_Int, TB_NO_REVERSE },
	{ X86::VFMSUBSD4rr, X86::VFMSUBSD4mr, TB_ALIGN_NONE },
	{ X86::VFMSUBSD4rr_Int, X86::VFMSUBSD4mr_Int, TB_NO_REVERSE },
	{ X86::VFMSUBPS4rr, X86::VFMSUBPS4mr, TB_ALIGN_NONE },
	{ X86::VFMSUBPD4rr, X86::VFMSUBPD4mr, TB_ALIGN_NONE },
	{ X86::VFMSUBPS4Yrr, X86::VFMSUBPS4Ymr, TB_ALIGN_NONE },
	{ X86::VFMSUBPD4Yrr, X86::VFMSUBPD4Ymr, TB_ALIGN_NONE },
	{ X86::VFNMSUBSS4rr, X86::VFNMSUBSS4mr, TB_ALIGN_NONE },
	{ X86::VFNMSUBSS4rr_Int, X86::VFNMSUBSS4mr_Int, TB_NO_REVERSE },
	{ X86::VFNMSUBSD4rr, X86::VFNMSUBSD4mr, TB_ALIGN_NONE },
	{ X86::VFNMSUBSD4rr_Int, X86::VFNMSUBSD4mr_Int, TB_NO_REVERSE },
	{ X86::VFNMSUBPS4rr, X86::VFNMSUBPS4mr, TB_ALIGN_NONE },
	{ X86::VFNMSUBPD4rr, X86::VFNMSUBPD4mr, TB_ALIGN_NONE },
	{ X86::VFNMSUBPS4Yrr, X86::VFNMSUBPS4Ymr, TB_ALIGN_NONE },
	{ X86::VFNMSUBPD4Yrr, X86::VFNMSUBPD4Ymr, TB_ALIGN_NONE },
	{ X86::VFMADDSUBPS4rr, X86::VFMADDSUBPS4mr, TB_ALIGN_NONE },
	{ X86::VFMADDSUBPD4rr, X86::VFMADDSUBPD4mr, TB_ALIGN_NONE },
	{ X86::VFMADDSUBPS4Yrr, X86::VFMADDSUBPS4Ymr, TB_ALIGN_NONE },
	{ X86::VFMADDSUBPD4Yrr, X86::VFMADDSUBPD4Ymr, TB_ALIGN_NONE },
	{ X86::VFMSUBADDPS4rr, X86::VFMSUBADDPS4mr, TB_ALIGN_NONE },
	{ X86::VFMSUBADDPD4rr, X86::VFMSUBADDPD4mr, TB_ALIGN_NONE },
	{ X86::VFMSUBADDPS4Yrr, X86::VFMSUBADDPS4Ymr, TB_ALIGN_NONE },
	{ X86::VFMSUBADDPD4Yrr, X86::VFMSUBADDPD4Ymr, TB_ALIGN_NONE },

	// XOP foldable instructions
	{ X86::VPCMOVrrr, X86::VPCMOVrmr, 0 },
	{ X86::VPCMOVrrrY, X86::VPCMOVrmrY, 0 },
	{ X86::VPCOMBri, X86::VPCOMBmi, 0 },
	{ X86::VPCOMDri, X86::VPCOMDmi, 0 },
	{ X86::VPCOMQri, X86::VPCOMQmi, 0 },
	{ X86::VPCOMWri, X86::VPCOMWmi, 0 },
	{ X86::VPCOMUBri, X86::VPCOMUBmi, 0 },
	{ X86::VPCOMUDri, X86::VPCOMUDmi, 0 },
	{ X86::VPCOMUQri, X86::VPCOMUQmi, 0 },
	{ X86::VPCOMUWri, X86::VPCOMUWmi, 0 },
	{ X86::VPERMIL2PDrr, X86::VPERMIL2PDmr, 0 },
	{ X86::VPERMIL2PDrrY, X86::VPERMIL2PDmrY, 0 },
	{ X86::VPERMIL2PSrr, X86::VPERMIL2PSmr, 0 },
	{ X86::VPERMIL2PSrrY, X86::VPERMIL2PSmrY, 0 },
	{ X86::VPMACSDDrr, X86::VPMACSDDrm, 0 },
	{ X86::VPMACSDQHrr, X86::VPMACSDQHrm, 0 },
	{ X86::VPMACSDQLrr, X86::VPMACSDQLrm, 0 },
	{ X86::VPMACSSDDrr, X86::VPMACSSDDrm, 0 },
	{ X86::VPMACSSDQHrr, X86::VPMACSSDQHrm, 0 },
	{ X86::VPMACSSDQLrr, X86::VPMACSSDQLrm, 0 },
	{ X86::VPMACSSWDrr, X86::VPMACSSWDrm, 0 },
	{ X86::VPMACSSWWrr, X86::VPMACSSWWrm, 0 },
	{ X86::VPMACSWDrr, X86::VPMACSWDrm, 0 },
	{ X86::VPMACSWWrr, X86::VPMACSWWrm, 0 },
	{ X86::VPMADCSSWDrr, X86::VPMADCSSWDrm, 0 },
	{ X86::VPMADCSWDrr, X86::VPMADCSWDrm, 0 },
	{ X86::VPPERMrrr, X86::VPPERMrmr, 0 },
	{ X86::VPROTBrr, X86::VPROTBrm, 0 },
	{ X86::VPROTDrr, X86::VPROTDrm, 0 },
	{ X86::VPROTQrr, X86::VPROTQrm, 0 },
	{ X86::VPROTWrr, X86::VPROTWrm, 0 },
	{ X86::VPSHABrr, X86::VPSHABrm, 0 },
	{ X86::VPSHADrr, X86::VPSHADrm, 0 },
	{ X86::VPSHAQrr, X86::VPSHAQrm, 0 },
	{ X86::VPSHAWrr, X86::VPSHAWrm, 0 },
	{ X86::VPSHLBrr, X86::VPSHLBrm, 0 },
	{ X86::VPSHLDrr, X86::VPSHLDrm, 0 },
	{ X86::VPSHLQrr, X86::VPSHLQrm, 0 },
	{ X86::VPSHLWrr, X86::VPSHLWrm, 0 },

	// BMI/BMI2 foldable instructions
	{ X86::ANDN32rr, X86::ANDN32rm, 0 },
	{ X86::ANDN64rr, X86::ANDN64rm, 0 },
	{ X86::MULX32rr, X86::MULX32rm, 0 },
	{ X86::MULX64rr, X86::MULX64rm, 0 },
	{ X86::PDEP32rr, X86::PDEP32rm, 0 },
	{ X86::PDEP64rr, X86::PDEP64rm, 0 },
	{ X86::PEXT32rr, X86::PEXT32rm, 0 },
	{ X86::PEXT64rr, X86::PEXT64rm, 0 },

	// ADX foldable instructions
	{ X86::ADCX32rr, X86::ADCX32rm, 0 },
	{ X86::ADCX64rr, X86::ADCX64rm, 0 },
	{ X86::ADOX32rr, X86::ADOX32rm, 0 },
	{ X86::ADOX64rr, X86::ADOX64rm, 0 },

	// AVX-512 foldable instructions
	{ X86::VADDPDZrr, X86::VADDPDZrm, 0 },
	{ X86::VADDPSZrr, X86::VADDPSZrm, 0 },
	{ X86::VADDSDZrr, X86::VADDSDZrm, 0 },
	{ X86::VADDSDZrr_Int, X86::VADDSDZrm_Int, TB_NO_REVERSE },
	{ X86::VADDSSZrr, X86::VADDSSZrm, 0 },
	{ X86::VADDSSZrr_Int, X86::VADDSSZrm_Int, TB_NO_REVERSE },
	{ X86::VALIGNDZrri, X86::VALIGNDZrmi, 0 },
	{ X86::VALIGNQZrri, X86::VALIGNQZrmi, 0 },
	{ X86::VANDNPDZrr, X86::VANDNPDZrm, 0 },
	{ X86::VANDNPSZrr, X86::VANDNPSZrm, 0 },
	{ X86::VANDPDZrr, X86::VANDPDZrm, 0 },
	{ X86::VANDPSZrr, X86::VANDPSZrm, 0 },
	{ X86::VBROADCASTSSZrkz, X86::VBROADCASTSSZmkz, TB_NO_REVERSE },
	{ X86::VBROADCASTSDZrkz, X86::VBROADCASTSDZmkz, TB_NO_REVERSE },
	{ X86::VCMPPDZrri, X86::VCMPPDZrmi, 0 },
	{ X86::VCMPPSZrri, X86::VCMPPSZrmi, 0 },
	{ X86::VCMPSDZrr, X86::VCMPSDZrm, 0 },
	{ X86::VCMPSDZrr_Int, X86::VCMPSDZrm_Int, TB_NO_REVERSE },
	{ X86::VCMPSSZrr, X86::VCMPSSZrm, 0 },
	{ X86::VCMPSSZrr_Int, X86::VCMPSSZrm_Int, TB_NO_REVERSE },
	{ X86::VDIVPDZrr, X86::VDIVPDZrm, 0 },
	{ X86::VDIVPSZrr, X86::VDIVPSZrm, 0 },
	{ X86::VDIVSDZrr, X86::VDIVSDZrm, 0 },
	{ X86::VDIVSDZrr_Int, X86::VDIVSDZrm_Int, TB_NO_REVERSE },
	{ X86::VDIVSSZrr, X86::VDIVSSZrm, 0 },
	{ X86::VDIVSSZrr_Int, X86::VDIVSSZrm_Int, TB_NO_REVERSE },
	{ X86::VINSERTF32x4Zrr, X86::VINSERTF32x4Zrm, 0 },
	{ X86::VINSERTF32x8Zrr, X86::VINSERTF32x8Zrm, 0 },
	{ X86::VINSERTF64x2Zrr, X86::VINSERTF64x2Zrm, 0 },
	{ X86::VINSERTF64x4Zrr, X86::VINSERTF64x4Zrm, 0 },
	{ X86::VINSERTI32x4Zrr, X86::VINSERTI32x4Zrm, 0 },
	{ X86::VINSERTI32x8Zrr, X86::VINSERTI32x8Zrm, 0 },
	{ X86::VINSERTI64x2Zrr, X86::VINSERTI64x2Zrm, 0 },
	{ X86::VINSERTI64x4Zrr, X86::VINSERTI64x4Zrm, 0 },
	{ X86::VMAXCPDZrr, X86::VMAXCPDZrm, 0 },
	{ X86::VMAXCPSZrr, X86::VMAXCPSZrm, 0 },
	{ X86::VMAXCSDZrr, X86::VMAXCSDZrm, 0 },
	{ X86::VMAXCSSZrr, X86::VMAXCSSZrm, 0 },
	{ X86::VMAXPDZrr, X86::VMAXPDZrm, 0 },
	{ X86::VMAXPSZrr, X86::VMAXPSZrm, 0 },
	{ X86::VMAXSDZrr, X86::VMAXSDZrm, 0 },
	{ X86::VMAXSDZrr_Int, X86::VMAXSDZrm_Int, TB_NO_REVERSE },
	{ X86::VMAXSSZrr, X86::VMAXSSZrm, 0 },
	{ X86::VMAXSSZrr_Int, X86::VMAXSSZrm_Int, TB_NO_REVERSE },
	{ X86::VMINCPDZrr, X86::VMINCPDZrm, 0 },
	{ X86::VMINCPSZrr, X86::VMINCPSZrm, 0 },
	{ X86::VMINCSDZrr, X86::VMINCSDZrm, 0 },
	{ X86::VMINCSSZrr, X86::VMINCSSZrm, 0 },
	{ X86::VMINPDZrr, X86::VMINPDZrm, 0 },
	{ X86::VMINPSZrr, X86::VMINPSZrm, 0 },
	{ X86::VMINSDZrr, X86::VMINSDZrm, 0 },
	{ X86::VMINSDZrr_Int, X86::VMINSDZrm_Int, TB_NO_REVERSE },
	{ X86::VMINSSZrr, X86::VMINSSZrm, 0 },
	{ X86::VMINSSZrr_Int, X86::VMINSSZrm_Int, TB_NO_REVERSE },
	{ X86::VMULPDZrr, X86::VMULPDZrm, 0 },
	{ X86::VMULPSZrr, X86::VMULPSZrm, 0 },
	{ X86::VMULSDZrr, X86::VMULSDZrm, 0 },
	{ X86::VMULSDZrr_Int, X86::VMULSDZrm_Int, TB_NO_REVERSE },
	{ X86::VMULSSZrr, X86::VMULSSZrm, 0 },
	{ X86::VMULSSZrr_Int, X86::VMULSSZrm_Int, TB_NO_REVERSE },
	{ X86::VORPDZrr, X86::VORPDZrm, 0 },
	{ X86::VORPSZrr, X86::VORPSZrm, 0 },
	{ X86::VPADDBZrr, X86::VPADDBZrm, 0 },
	{ X86::VPADDDZrr, X86::VPADDDZrm, 0 },
	{ X86::VPADDQZrr, X86::VPADDQZrm, 0 },
	{ X86::VPADDSBZrr, X86::VPADDSBZrm, 0 },
	{ X86::VPADDSWZrr, X86::VPADDSWZrm, 0 },
	{ X86::VPADDUSBZrr, X86::VPADDUSBZrm, 0 },
	{ X86::VPADDUSWZrr, X86::VPADDUSWZrm, 0 },
	{ X86::VPADDWZrr, X86::VPADDWZrm, 0 },
	{ X86::VPALIGNRZrri, X86::VPALIGNRZrmi, 0 },
	{ X86::VPANDDZrr, X86::VPANDDZrm, 0 },
	{ X86::VPANDNDZrr, X86::VPANDNDZrm, 0 },
	{ X86::VPANDNQZrr, X86::VPANDNQZrm, 0 },
	{ X86::VPANDQZrr, X86::VPANDQZrm, 0 },
	{ X86::VPCMPBZrri, X86::VPCMPBZrmi, 0 },
	{ X86::VPCMPDZrri, X86::VPCMPDZrmi, 0 },
	{ X86::VPCMPEQBZrr, X86::VPCMPEQBZrm, 0 },
	{ X86::VPCMPEQDZrr, X86::VPCMPEQDZrm, 0 },
	{ X86::VPCMPEQQZrr, X86::VPCMPEQQZrm, 0 },
	{ X86::VPCMPEQWZrr, X86::VPCMPEQWZrm, 0 },
	{ X86::VPCMPGTBZrr, X86::VPCMPGTBZrm, 0 },
	{ X86::VPCMPGTDZrr, X86::VPCMPGTDZrm, 0 },
	{ X86::VPCMPGTQZrr, X86::VPCMPGTQZrm, 0 },
	{ X86::VPCMPGTWZrr, X86::VPCMPGTWZrm, 0 },
	{ X86::VPCMPQZrri, X86::VPCMPQZrmi, 0 },
	{ X86::VPCMPUBZrri, X86::VPCMPUBZrmi, 0 },
	{ X86::VPCMPUDZrri, X86::VPCMPUDZrmi, 0 },
	{ X86::VPCMPUQZrri, X86::VPCMPUQZrmi, 0 },
	{ X86::VPCMPUWZrri, X86::VPCMPUWZrmi, 0 },
	{ X86::VPCMPWZrri, X86::VPCMPWZrmi, 0 },
	{ X86::VPERMBZrr, X86::VPERMBZrm, 0 },
	{ X86::VPERMDZrr, X86::VPERMDZrm, 0 },
	{ X86::VPERMILPDZrr, X86::VPERMILPDZrm, 0 },
	{ X86::VPERMILPSZrr, X86::VPERMILPSZrm, 0 },
	{ X86::VPERMPDZrr, X86::VPERMPDZrm, 0 },
	{ X86::VPERMPSZrr, X86::VPERMPSZrm, 0 },
	{ X86::VPERMQZrr, X86::VPERMQZrm, 0 },
	{ X86::VPERMWZrr, X86::VPERMWZrm, 0 },
	{ X86::VPMADDUBSWZrr, X86::VPMADDUBSWZrm, 0 },
	{ X86::VPMADDWDZrr, X86::VPMADDWDZrm, 0 },
	{ X86::VPMAXSDZrr, X86::VPMAXSDZrm, 0 },
	{ X86::VPMAXSQZrr, X86::VPMAXSQZrm, 0 },
	{ X86::VPMAXUDZrr, X86::VPMAXUDZrm, 0 },
	{ X86::VPMAXUQZrr, X86::VPMAXUQZrm, 0 },
	{ X86::VPMINSDZrr, X86::VPMINSDZrm, 0 },
	{ X86::VPMINSQZrr, X86::VPMINSQZrm, 0 },
	{ X86::VPMINUDZrr, X86::VPMINUDZrm, 0 },
	{ X86::VPMINUQZrr, X86::VPMINUQZrm, 0 },
	{ X86::VPMULDQZrr, X86::VPMULDQZrm, 0 },
	{ X86::VPMULUDQZrr, X86::VPMULUDQZrm, 0 },
	{ X86::VPORDZrr, X86::VPORDZrm, 0 },
	{ X86::VPORQZrr, X86::VPORQZrm, 0 },
	{ X86::VPSHUFBZrr, X86::VPSHUFBZrm, 0 },
	{ X86::VPSLLVDZrr, X86::VPSLLVDZrm, 0 },
	{ X86::VPSLLVQZrr, X86::VPSLLVQZrm, 0 },
	{ X86::VPSRAVDZrr, X86::VPSRAVDZrm, 0 },
	{ X86::VPSRLVDZrr, X86::VPSRLVDZrm, 0 },
	{ X86::VPSRLVQZrr, X86::VPSRLVQZrm, 0 },
	{ X86::VPSUBBZrr, X86::VPSUBBZrm, 0 },
	{ X86::VPSUBDZrr, X86::VPSUBDZrm, 0 },
	{ X86::VPSUBQZrr, X86::VPSUBQZrm, 0 },
	{ X86::VPSUBSBZrr, X86::VPSUBSBZrm, 0 },
	{ X86::VPSUBSWZrr, X86::VPSUBSWZrm, 0 },
	{ X86::VPSUBUSBZrr, X86::VPSUBUSBZrm, 0 },
	{ X86::VPSUBUSWZrr, X86::VPSUBUSWZrm, 0 },
	{ X86::VPSUBWZrr, X86::VPSUBWZrm, 0 },
	{ X86::VPUNPCKHBWZrr, X86::VPUNPCKHBWZrm, 0 },
	{ X86::VPUNPCKHDQZrr, X86::VPUNPCKHDQZrm, 0 },
	{ X86::VPUNPCKHQDQZrr, X86::VPUNPCKHQDQZrm, 0 },
	{ X86::VPUNPCKHWDZrr, X86::VPUNPCKHWDZrm, 0 },
	{ X86::VPUNPCKLBWZrr, X86::VPUNPCKLBWZrm, 0 },
	{ X86::VPUNPCKLDQZrr, X86::VPUNPCKLDQZrm, 0 },
	{ X86::VPUNPCKLQDQZrr, X86::VPUNPCKLQDQZrm, 0 },
	{ X86::VPUNPCKLWDZrr, X86::VPUNPCKLWDZrm, 0 },
	{ X86::VPXORDZrr, X86::VPXORDZrm, 0 },
	{ X86::VPXORQZrr, X86::VPXORQZrm, 0 },
	{ X86::VSHUFPDZrri, X86::VSHUFPDZrmi, 0 },
	{ X86::VSHUFPSZrri, X86::VSHUFPSZrmi, 0 },
	{ X86::VSUBPDZrr, X86::VSUBPDZrm, 0 },
	{ X86::VSUBPSZrr, X86::VSUBPSZrm, 0 },
	{ X86::VSUBSDZrr, X86::VSUBSDZrm, 0 },
	{ X86::VSUBSDZrr_Int, X86::VSUBSDZrm_Int, TB_NO_REVERSE },
	{ X86::VSUBSSZrr, X86::VSUBSSZrm, 0 },
	{ X86::VSUBSSZrr_Int, X86::VSUBSSZrm_Int, TB_NO_REVERSE },
	{ X86::VUNPCKHPDZrr, X86::VUNPCKHPDZrm, 0 },
	{ X86::VUNPCKHPSZrr, X86::VUNPCKHPSZrm, 0 },
	{ X86::VUNPCKLPDZrr, X86::VUNPCKLPDZrm, 0 },
	{ X86::VUNPCKLPSZrr, X86::VUNPCKLPSZrm, 0 },
	{ X86::VXORPDZrr, X86::VXORPDZrm, 0 },
	{ X86::VXORPSZrr, X86::VXORPSZrm, 0 },

	// AVX-512{F,VL} foldable instructions
	{ X86::VADDPDZ128rr, X86::VADDPDZ128rm, 0 },
	{ X86::VADDPDZ256rr, X86::VADDPDZ256rm, 0 },
	{ X86::VADDPSZ128rr, X86::VADDPSZ128rm, 0 },
	{ X86::VADDPSZ256rr, X86::VADDPSZ256rm, 0 },
	{ X86::VALIGNDZ128rri, X86::VALIGNDZ128rmi, 0 },
	{ X86::VALIGNDZ256rri, X86::VALIGNDZ256rmi, 0 },
	{ X86::VALIGNQZ128rri, X86::VALIGNQZ128rmi, 0 },
	{ X86::VALIGNQZ256rri, X86::VALIGNQZ256rmi, 0 },
	{ X86::VANDNPDZ128rr, X86::VANDNPDZ128rm, 0 },
	{ X86::VANDNPDZ256rr, X86::VANDNPDZ256rm, 0 },
	{ X86::VANDNPSZ128rr, X86::VANDNPSZ128rm, 0 },
	{ X86::VANDNPSZ256rr, X86::VANDNPSZ256rm, 0 },
	{ X86::VANDPDZ128rr, X86::VANDPDZ128rm, 0 },
	{ X86::VANDPDZ256rr, X86::VANDPDZ256rm, 0 },
	{ X86::VANDPSZ128rr, X86::VANDPSZ128rm, 0 },
	{ X86::VANDPSZ256rr, X86::VANDPSZ256rm, 0 },
	{ X86::VBROADCASTSSZ128rkz, X86::VBROADCASTSSZ128mkz, TB_NO_REVERSE },
	{ X86::VBROADCASTSSZ256rkz, X86::VBROADCASTSSZ256mkz, TB_NO_REVERSE },
	{ X86::VBROADCASTSDZ256rkz, X86::VBROADCASTSDZ256mkz, TB_NO_REVERSE },
	{ X86::VCMPPDZ128rri, X86::VCMPPDZ128rmi, 0 },
	{ X86::VCMPPDZ256rri, X86::VCMPPDZ256rmi, 0 },
	{ X86::VCMPPSZ128rri, X86::VCMPPSZ128rmi, 0 },
	{ X86::VCMPPSZ256rri, X86::VCMPPSZ256rmi, 0 },
	{ X86::VDIVPDZ128rr, X86::VDIVPDZ128rm, 0 },
	{ X86::VDIVPDZ256rr, X86::VDIVPDZ256rm, 0 },
	{ X86::VDIVPSZ128rr, X86::VDIVPSZ128rm, 0 },
	{ X86::VDIVPSZ256rr, X86::VDIVPSZ256rm, 0 },
	{ X86::VINSERTF32x4Z256rr,X86::VINSERTF32x4Z256rm, 0 },
	{ X86::VINSERTF64x2Z256rr,X86::VINSERTF64x2Z256rm, 0 },
	{ X86::VINSERTI32x4Z256rr,X86::VINSERTI32x4Z256rm, 0 },
	{ X86::VINSERTI64x2Z256rr,X86::VINSERTI64x2Z256rm, 0 },
	{ X86::VMAXCPDZ128rr, X86::VMAXCPDZ128rm, 0 },
	{ X86::VMAXCPDZ256rr, X86::VMAXCPDZ256rm, 0 },
	{ X86::VMAXCPSZ128rr, X86::VMAXCPSZ128rm, 0 },
	{ X86::VMAXCPSZ256rr, X86::VMAXCPSZ256rm, 0 },
	{ X86::VMAXPDZ128rr, X86::VMAXPDZ128rm, 0 },
	{ X86::VMAXPDZ256rr, X86::VMAXPDZ256rm, 0 },
	{ X86::VMAXPSZ128rr, X86::VMAXPSZ128rm, 0 },
	{ X86::VMAXPSZ256rr, X86::VMAXPSZ256rm, 0 },
	{ X86::VMINCPDZ128rr, X86::VMINCPDZ128rm, 0 },
	{ X86::VMINCPDZ256rr, X86::VMINCPDZ256rm, 0 },
	{ X86::VMINCPSZ128rr, X86::VMINCPSZ128rm, 0 },
	{ X86::VMINCPSZ256rr, X86::VMINCPSZ256rm, 0 },
	{ X86::VMINPDZ128rr, X86::VMINPDZ128rm, 0 },
	{ X86::VMINPDZ256rr, X86::VMINPDZ256rm, 0 },
	{ X86::VMINPSZ128rr, X86::VMINPSZ128rm, 0 },
	{ X86::VMINPSZ256rr, X86::VMINPSZ256rm, 0 },
	{ X86::VMULPDZ128rr, X86::VMULPDZ128rm, 0 },
	{ X86::VMULPDZ256rr, X86::VMULPDZ256rm, 0 },
	{ X86::VMULPSZ128rr, X86::VMULPSZ128rm, 0 },
	{ X86::VMULPSZ256rr, X86::VMULPSZ256rm, 0 },
	{ X86::VORPDZ128rr, X86::VORPDZ128rm, 0 },
	{ X86::VORPDZ256rr, X86::VORPDZ256rm, 0 },
	{ X86::VORPSZ128rr, X86::VORPSZ128rm, 0 },
	{ X86::VORPSZ256rr, X86::VORPSZ256rm, 0 },
	{ X86::VPADDBZ128rr, X86::VPADDBZ128rm, 0 },
	{ X86::VPADDBZ256rr, X86::VPADDBZ256rm, 0 },
	{ X86::VPADDDZ128rr, X86::VPADDDZ128rm, 0 },
	{ X86::VPADDDZ256rr, X86::VPADDDZ256rm, 0 },
	{ X86::VPADDQZ128rr, X86::VPADDQZ128rm, 0 },
	{ X86::VPADDQZ256rr, X86::VPADDQZ256rm, 0 },
	{ X86::VPADDSBZ128rr, X86::VPADDSBZ128rm, 0 },
	{ X86::VPADDSBZ256rr, X86::VPADDSBZ256rm, 0 },
	{ X86::VPADDSWZ128rr, X86::VPADDSWZ128rm, 0 },
	{ X86::VPADDSWZ256rr, X86::VPADDSWZ256rm, 0 },
	{ X86::VPADDUSBZ128rr, X86::VPADDUSBZ128rm, 0 },
	{ X86::VPADDUSBZ256rr, X86::VPADDUSBZ256rm, 0 },
	{ X86::VPADDUSWZ128rr, X86::VPADDUSWZ128rm, 0 },
	{ X86::VPADDUSWZ256rr, X86::VPADDUSWZ256rm, 0 },
	{ X86::VPADDWZ128rr, X86::VPADDWZ128rm, 0 },
	{ X86::VPADDWZ256rr, X86::VPADDWZ256rm, 0 },
	{ X86::VPALIGNRZ128rri, X86::VPALIGNRZ128rmi, 0 },
	{ X86::VPALIGNRZ256rri, X86::VPALIGNRZ256rmi, 0 },
	{ X86::VPANDDZ128rr, X86::VPANDDZ128rm, 0 },
	{ X86::VPANDDZ256rr, X86::VPANDDZ256rm, 0 },
	{ X86::VPANDNDZ128rr, X86::VPANDNDZ128rm, 0 },
	{ X86::VPANDNDZ256rr, X86::VPANDNDZ256rm, 0 },
	{ X86::VPANDNQZ128rr, X86::VPANDNQZ128rm, 0 },
	{ X86::VPANDNQZ256rr, X86::VPANDNQZ256rm, 0 },
	{ X86::VPANDQZ128rr, X86::VPANDQZ128rm, 0 },
	{ X86::VPANDQZ256rr, X86::VPANDQZ256rm, 0 },
	{ X86::VPCMPBZ128rri, X86::VPCMPBZ128rmi, 0 },
	{ X86::VPCMPBZ256rri, X86::VPCMPBZ256rmi, 0 },
	{ X86::VPCMPDZ128rri, X86::VPCMPDZ128rmi, 0 },
	{ X86::VPCMPDZ256rri, X86::VPCMPDZ256rmi, 0 },
	{ X86::VPCMPEQBZ128rr, X86::VPCMPEQBZ128rm, 0 },
	{ X86::VPCMPEQBZ256rr, X86::VPCMPEQBZ256rm, 0 },
	{ X86::VPCMPEQDZ128rr, X86::VPCMPEQDZ128rm, 0 },
	{ X86::VPCMPEQDZ256rr, X86::VPCMPEQDZ256rm, 0 },
	{ X86::VPCMPEQQZ128rr, X86::VPCMPEQQZ128rm, 0 },
	{ X86::VPCMPEQQZ256rr, X86::VPCMPEQQZ256rm, 0 },
	{ X86::VPCMPEQWZ128rr, X86::VPCMPEQWZ128rm, 0 },
	{ X86::VPCMPEQWZ256rr, X86::VPCMPEQWZ256rm, 0 },
	{ X86::VPCMPGTBZ128rr, X86::VPCMPGTBZ128rm, 0 },
	{ X86::VPCMPGTBZ256rr, X86::VPCMPGTBZ256rm, 0 },
	{ X86::VPCMPGTDZ128rr, X86::VPCMPGTDZ128rm, 0 },
	{ X86::VPCMPGTDZ256rr, X86::VPCMPGTDZ256rm, 0 },
	{ X86::VPCMPGTQZ128rr, X86::VPCMPGTQZ128rm, 0 },
	{ X86::VPCMPGTQZ256rr, X86::VPCMPGTQZ256rm, 0 },
	{ X86::VPCMPGTWZ128rr, X86::VPCMPGTWZ128rm, 0 },
	{ X86::VPCMPGTWZ256rr, X86::VPCMPGTWZ256rm, 0 },
	{ X86::VPCMPQZ128rri, X86::VPCMPQZ128rmi, 0 },
	{ X86::VPCMPQZ256rri, X86::VPCMPQZ256rmi, 0 },
	{ X86::VPCMPUBZ128rri, X86::VPCMPUBZ128rmi, 0 },
	{ X86::VPCMPUBZ256rri, X86::VPCMPUBZ256rmi, 0 },
	{ X86::VPCMPUDZ128rri, X86::VPCMPUDZ128rmi, 0 },
	{ X86::VPCMPUDZ256rri, X86::VPCMPUDZ256rmi, 0 },
	{ X86::VPCMPUQZ128rri, X86::VPCMPUQZ128rmi, 0 },
	{ X86::VPCMPUQZ256rri, X86::VPCMPUQZ256rmi, 0 },
	{ X86::VPCMPUWZ128rri, X86::VPCMPUWZ128rmi, 0 },
	{ X86::VPCMPUWZ256rri, X86::VPCMPUWZ256rmi, 0 },
	{ X86::VPCMPWZ128rri, X86::VPCMPWZ128rmi, 0 },
	{ X86::VPCMPWZ256rri, X86::VPCMPWZ256rmi, 0 },
	{ X86::VPERMBZ128rr, X86::VPERMBZ128rm, 0 },
	{ X86::VPERMBZ256rr, X86::VPERMBZ256rm, 0 },
	{ X86::VPERMDZ256rr, X86::VPERMDZ256rm, 0 },
	{ X86::VPERMILPDZ128rr, X86::VPERMILPDZ128rm, 0 },
	{ X86::VPERMILPDZ256rr, X86::VPERMILPDZ256rm, 0 },
	{ X86::VPERMILPSZ128rr, X86::VPERMILPSZ128rm, 0 },
	{ X86::VPERMILPSZ256rr, X86::VPERMILPSZ256rm, 0 },
	{ X86::VPERMPDZ256rr, X86::VPERMPDZ256rm, 0 },
	{ X86::VPERMPSZ256rr, X86::VPERMPSZ256rm, 0 },
	{ X86::VPERMQZ256rr, X86::VPERMQZ256rm, 0 },
	{ X86::VPERMWZ128rr, X86::VPERMWZ128rm, 0 },
	{ X86::VPERMWZ256rr, X86::VPERMWZ256rm, 0 },
	{ X86::VPMADDUBSWZ128rr, X86::VPMADDUBSWZ128rm, 0 },
	{ X86::VPMADDUBSWZ256rr, X86::VPMADDUBSWZ256rm, 0 },
	{ X86::VPMADDWDZ128rr, X86::VPMADDWDZ128rm, 0 },
	{ X86::VPMADDWDZ256rr, X86::VPMADDWDZ256rm, 0 },
	{ X86::VPORDZ128rr, X86::VPORDZ128rm, 0 },
	{ X86::VPORDZ256rr, X86::VPORDZ256rm, 0 },
	{ X86::VPORQZ128rr, X86::VPORQZ128rm, 0 },
	{ X86::VPORQZ256rr, X86::VPORQZ256rm, 0 },
	{ X86::VPSHUFBZ128rr, X86::VPSHUFBZ128rm, 0 },
	{ X86::VPSHUFBZ256rr, X86::VPSHUFBZ256rm, 0 },
	{ X86::VPSUBBZ128rr, X86::VPSUBBZ128rm, 0 },
	{ X86::VPSUBBZ256rr, X86::VPSUBBZ256rm, 0 },
	{ X86::VPSUBDZ128rr, X86::VPSUBDZ128rm, 0 },
	{ X86::VPSUBDZ256rr, X86::VPSUBDZ256rm, 0 },
	{ X86::VPSUBQZ128rr, X86::VPSUBQZ128rm, 0 },
	{ X86::VPSUBQZ256rr, X86::VPSUBQZ256rm, 0 },
	{ X86::VPSUBSBZ128rr, X86::VPSUBSBZ128rm, 0 },
	{ X86::VPSUBSBZ256rr, X86::VPSUBSBZ256rm, 0 },
	{ X86::VPSUBSWZ128rr, X86::VPSUBSWZ128rm, 0 },
	{ X86::VPSUBSWZ256rr, X86::VPSUBSWZ256rm, 0 },
	{ X86::VPSUBUSBZ128rr, X86::VPSUBUSBZ128rm, 0 },
	{ X86::VPSUBUSBZ256rr, X86::VPSUBUSBZ256rm, 0 },
	{ X86::VPSUBUSWZ128rr, X86::VPSUBUSWZ128rm, 0 },
	{ X86::VPSUBUSWZ256rr, X86::VPSUBUSWZ256rm, 0 },
	{ X86::VPSUBWZ128rr, X86::VPSUBWZ128rm, 0 },
	{ X86::VPSUBWZ256rr, X86::VPSUBWZ256rm, 0 },
	{ X86::VPUNPCKHBWZ128rr, X86::VPUNPCKHBWZ128rm, 0 },
	{ X86::VPUNPCKHBWZ256rr, X86::VPUNPCKHBWZ256rm, 0 },
	{ X86::VPUNPCKHDQZ128rr, X86::VPUNPCKHDQZ128rm, 0 },
	{ X86::VPUNPCKHDQZ256rr, X86::VPUNPCKHDQZ256rm, 0 },
	{ X86::VPUNPCKHQDQZ128rr, X86::VPUNPCKHQDQZ128rm, 0 },
	{ X86::VPUNPCKHQDQZ256rr, X86::VPUNPCKHQDQZ256rm, 0 },
	{ X86::VPUNPCKHWDZ128rr, X86::VPUNPCKHWDZ128rm, 0 },
	{ X86::VPUNPCKHWDZ256rr, X86::VPUNPCKHWDZ256rm, 0 },
	{ X86::VPUNPCKLBWZ128rr, X86::VPUNPCKLBWZ128rm, 0 },
	{ X86::VPUNPCKLBWZ256rr, X86::VPUNPCKLBWZ256rm, 0 },
	{ X86::VPUNPCKLDQZ128rr, X86::VPUNPCKLDQZ128rm, 0 },
	{ X86::VPUNPCKLDQZ256rr, X86::VPUNPCKLDQZ256rm, 0 },
	{ X86::VPUNPCKLQDQZ128rr, X86::VPUNPCKLQDQZ128rm, 0 },
	{ X86::VPUNPCKLQDQZ256rr, X86::VPUNPCKLQDQZ256rm, 0 },
	{ X86::VPUNPCKLWDZ128rr, X86::VPUNPCKLWDZ128rm, 0 },
	{ X86::VPUNPCKLWDZ256rr, X86::VPUNPCKLWDZ256rm, 0 },
	{ X86::VPXORDZ128rr, X86::VPXORDZ128rm, 0 },
	{ X86::VPXORDZ256rr, X86::VPXORDZ256rm, 0 },
	{ X86::VPXORQZ128rr, X86::VPXORQZ128rm, 0 },
	{ X86::VPXORQZ256rr, X86::VPXORQZ256rm, 0 },
	{ X86::VSUBPDZ128rr, X86::VSUBPDZ128rm, 0 },
	{ X86::VSUBPDZ256rr, X86::VSUBPDZ256rm, 0 },
	{ X86::VSUBPSZ128rr, X86::VSUBPSZ128rm, 0 },
	{ X86::VSUBPSZ256rr, X86::VSUBPSZ256rm, 0 },
	{ X86::VUNPCKHPDZ128rr, X86::VUNPCKHPDZ128rm, 0 },
	{ X86::VUNPCKHPDZ256rr, X86::VUNPCKHPDZ256rm, 0 },
	{ X86::VUNPCKHPSZ128rr, X86::VUNPCKHPSZ128rm, 0 },
	{ X86::VUNPCKHPSZ256rr, X86::VUNPCKHPSZ256rm, 0 },
	{ X86::VUNPCKLPDZ128rr, X86::VUNPCKLPDZ128rm, 0 },
	{ X86::VUNPCKLPDZ256rr, X86::VUNPCKLPDZ256rm, 0 },
	{ X86::VUNPCKLPSZ128rr, X86::VUNPCKLPSZ128rm, 0 },
	{ X86::VUNPCKLPSZ256rr, X86::VUNPCKLPSZ256rm, 0 },
	{ X86::VXORPDZ128rr, X86::VXORPDZ128rm, 0 },
	{ X86::VXORPDZ256rr, X86::VXORPDZ256rm, 0 },
	{ X86::VXORPSZ128rr, X86::VXORPSZ128rm, 0 },
	{ X86::VXORPSZ256rr, X86::VXORPSZ256rm, 0 },

	// AVX-512 masked foldable instructions
	{ X86::VPERMILPDZrikz, X86::VPERMILPDZmikz, 0 },
	{ X86::VPERMILPSZrikz, X86::VPERMILPSZmikz, 0 },
	{ X86::VPERMPDZrikz, X86::VPERMPDZmikz, 0 },
	{ X86::VPERMQZrikz, X86::VPERMQZmikz, 0 },
	{ X86::VPMOVSXBDZrrkz, X86::VPMOVSXBDZrmkz, 0 },
	{ X86::VPMOVSXBQZrrkz, X86::VPMOVSXBQZrmkz, TB_NO_REVERSE },
	{ X86::VPMOVSXBWZrrkz, X86::VPMOVSXBWZrmkz, 0 },
	{ X86::VPMOVSXDQZrrkz, X86::VPMOVSXDQZrmkz, 0 },
	{ X86::VPMOVSXWDZrrkz, X86::VPMOVSXWDZrmkz, 0 },
	{ X86::VPMOVSXWQZrrkz, X86::VPMOVSXWQZrmkz, 0 },
	{ X86::VPMOVZXBDZrrkz, X86::VPMOVZXBDZrmkz, 0 },
	{ X86::VPMOVZXBQZrrkz, X86::VPMOVZXBQZrmkz, TB_NO_REVERSE },
	{ X86::VPMOVZXBWZrrkz, X86::VPMOVZXBWZrmkz, 0 },
	{ X86::VPMOVZXDQZrrkz, X86::VPMOVZXDQZrmkz, 0 },
	{ X86::VPMOVZXWDZrrkz, X86::VPMOVZXWDZrmkz, 0 },
	{ X86::VPMOVZXWQZrrkz, X86::VPMOVZXWQZrmkz, 0 },
	{ X86::VPSHUFDZrikz, X86::VPSHUFDZmikz, 0 },
	{ X86::VPSHUFHWZrikz, X86::VPSHUFHWZmikz, 0 },
	{ X86::VPSHUFLWZrikz, X86::VPSHUFLWZmikz, 0 },

	// AVX-512VL 256-bit masked foldable instructions
	{ X86::VPERMILPDZ256rikz, X86::VPERMILPDZ256mikz, 0 },
	{ X86::VPERMILPSZ256rikz, X86::VPERMILPSZ256mikz, 0 },
	{ X86::VPERMPDZ256rikz, X86::VPERMPDZ256mikz, 0 },
	{ X86::VPERMQZ256rikz, X86::VPERMQZ256mikz, 0 },
	{ X86::VPMOVSXBDZ256rrkz, X86::VPMOVSXBDZ256rmkz, TB_NO_REVERSE },
	{ X86::VPMOVSXBQZ256rrkz, X86::VPMOVSXBQZ256rmkz, TB_NO_REVERSE },
	{ X86::VPMOVSXBWZ256rrkz, X86::VPMOVSXBWZ256rmkz, 0 },
	{ X86::VPMOVSXDQZ256rrkz, X86::VPMOVSXDQZ256rmkz, 0 },
	{ X86::VPMOVSXWDZ256rrkz, X86::VPMOVSXWDZ256rmkz, 0 },
	{ X86::VPMOVSXWQZ256rrkz, X86::VPMOVSXWQZ256rmkz, TB_NO_REVERSE },
	{ X86::VPMOVZXBDZ256rrkz, X86::VPMOVZXBDZ256rmkz, TB_NO_REVERSE },
	{ X86::VPMOVZXBQZ256rrkz, X86::VPMOVZXBQZ256rmkz, TB_NO_REVERSE },
	{ X86::VPMOVZXBWZ256rrkz, X86::VPMOVZXBWZ256rmkz, 0 },
	{ X86::VPMOVZXDQZ256rrkz, X86::VPMOVZXDQZ256rmkz, 0 },
	{ X86::VPMOVZXWDZ256rrkz, X86::VPMOVZXWDZ256rmkz, 0 },
	{ X86::VPMOVZXWQZ256rrkz, X86::VPMOVZXWQZ256rmkz, TB_NO_REVERSE },
	{ X86::VPSHUFDZ256rikz, X86::VPSHUFDZ256mikz, 0 },
	{ X86::VPSHUFHWZ256rikz, X86::VPSHUFHWZ256mikz, 0 },
	{ X86::VPSHUFLWZ256rikz, X86::VPSHUFLWZ256mikz, 0 },

	// AVX-512VL 128-bit masked foldable instructions
	{ X86::VPERMILPDZ128rikz, X86::VPERMILPDZ128mikz, 0 },
	{ X86::VPERMILPSZ128rikz, X86::VPERMILPSZ128mikz, 0 },
	{ X86::VPMOVSXBDZ128rrkz, X86::VPMOVSXBDZ128rmkz, TB_NO_REVERSE },
	{ X86::VPMOVSXBQZ128rrkz, X86::VPMOVSXBQZ128rmkz, TB_NO_REVERSE },
	{ X86::VPMOVSXBWZ128rrkz, X86::VPMOVSXBWZ128rmkz, TB_NO_REVERSE },
	{ X86::VPMOVSXDQZ128rrkz, X86::VPMOVSXDQZ128rmkz, TB_NO_REVERSE },
	{ X86::VPMOVSXWDZ128rrkz, X86::VPMOVSXWDZ128rmkz, TB_NO_REVERSE },
	{ X86::VPMOVSXWQZ128rrkz, X86::VPMOVSXWQZ128rmkz, TB_NO_REVERSE },
	{ X86::VPMOVZXBDZ128rrkz, X86::VPMOVZXBDZ128rmkz, TB_NO_REVERSE },
	{ X86::VPMOVZXBQZ128rrkz, X86::VPMOVZXBQZ128rmkz, TB_NO_REVERSE },
	{ X86::VPMOVZXBWZ128rrkz, X86::VPMOVZXBWZ128rmkz, TB_NO_REVERSE },
	{ X86::VPMOVZXDQZ128rrkz, X86::VPMOVZXDQZ128rmkz, TB_NO_REVERSE },
	{ X86::VPMOVZXWDZ128rrkz, X86::VPMOVZXWDZ128rmkz, TB_NO_REVERSE },
	{ X86::VPMOVZXWQZ128rrkz, X86::VPMOVZXWQZ128rmkz, TB_NO_REVERSE },
	{ X86::VPSHUFDZ128rikz, X86::VPSHUFDZ128mikz, 0 },
	{ X86::VPSHUFHWZ128rikz, X86::VPSHUFHWZ128mikz, 0 },
	{ X86::VPSHUFLWZ128rikz, X86::VPSHUFLWZ128mikz, 0 },

	// AES foldable instructions
	{ X86::AESDECLASTrr, X86::AESDECLASTrm, TB_ALIGN_16 },
	{ X86::AESDECrr, X86::AESDECrm, TB_ALIGN_16 },
	{ X86::AESENCLASTrr, X86::AESENCLASTrm, TB_ALIGN_16 },
	{ X86::AESENCrr, X86::AESENCrm, TB_ALIGN_16 },
	{ X86::VAESDECLASTrr, X86::VAESDECLASTrm, 0 },
	{ X86::VAESDECrr, X86::VAESDECrm, 0 },
	{ X86::VAESENCLASTrr, X86::VAESENCLASTrm, 0 },
	{ X86::VAESENCrr, X86::VAESENCrm, 0 },

	// SHA foldable instructions
	{ X86::SHA1MSG1rr, X86::SHA1MSG1rm, TB_ALIGN_16 },
	{ X86::SHA1MSG2rr, X86::SHA1MSG2rm, TB_ALIGN_16 },
	{ X86::SHA1NEXTErr, X86::SHA1NEXTErm, TB_ALIGN_16 },
	{ X86::SHA1RNDS4rri, X86::SHA1RNDS4rmi, TB_ALIGN_16 },
	{ X86::SHA256MSG1rr, X86::SHA256MSG1rm, TB_ALIGN_16 },
	{ X86::SHA256MSG2rr, X86::SHA256MSG2rm, TB_ALIGN_16 },
	{ X86::SHA256RNDS2rr, X86::SHA256RNDS2rm, TB_ALIGN_16 }
	};

	// Register every MemoryFoldTable2 entry in both lookup tables.
	// Each entry's own flags are combined with TB_INDEX_2 and TB_FOLDED_LOAD
	// ("index 2, folded load") before being handed to AddTableEntry.
	// Entries are only read, so iterate by const reference instead of copying.
	for (const X86MemoryFoldTableEntry &Entry : MemoryFoldTable2) {
	  AddTableEntry(RegOp2MemOpTable2, MemOp2RegOpTable,
	                Entry.RegOp, Entry.MemOp,
	                // Index 2, folded load
	                Entry.Flags | TB_INDEX_2 | TB_FOLDED_LOAD);
	}

	// MemoryFoldTable3: opcode pairs registered with TB_INDEX_3.
	// Each entry is { register-form opcode, memory-form opcode, TB_* flags };
	// the loop that follows this table reads them as Entry.RegOp, Entry.MemOp
	// and Entry.Flags and passes them to AddTableEntry together with
	// TB_INDEX_3 | TB_FOLDED_LOAD.
	// NOTE(review): TB_NO_REVERSE presumably marks entries that must not be
	// mapped back from the memory form to the register form -- confirm against
	// the TB_* flag definitions.
	static const X86MemoryFoldTableEntry MemoryFoldTable3[] = {
	// FMA4 foldable patterns
	{ X86::VFMADDSS4rr, X86::VFMADDSS4rm, TB_ALIGN_NONE },
	{ X86::VFMADDSS4rr_Int, X86::VFMADDSS4rm_Int, TB_NO_REVERSE },
	{ X86::VFMADDSD4rr, X86::VFMADDSD4rm, TB_ALIGN_NONE },
	{ X86::VFMADDSD4rr_Int, X86::VFMADDSD4rm_Int, TB_NO_REVERSE },
	{ X86::VFMADDPS4rr, X86::VFMADDPS4rm, TB_ALIGN_NONE },
	{ X86::VFMADDPD4rr, X86::VFMADDPD4rm, TB_ALIGN_NONE },
	{ X86::VFMADDPS4Yrr, X86::VFMADDPS4Yrm, TB_ALIGN_NONE },
	{ X86::VFMADDPD4Yrr, X86::VFMADDPD4Yrm, TB_ALIGN_NONE },
	{ X86::VFNMADDSS4rr, X86::VFNMADDSS4rm, TB_ALIGN_NONE },
	{ X86::VFNMADDSS4rr_Int, X86::VFNMADDSS4rm_Int, TB_NO_REVERSE },
	{ X86::VFNMADDSD4rr, X86::VFNMADDSD4rm, TB_ALIGN_NONE },
	{ X86::VFNMADDSD4rr_Int, X86::VFNMADDSD4rm_Int, TB_NO_REVERSE },
	{ X86::VFNMADDPS4rr, X86::VFNMADDPS4rm, TB_ALIGN_NONE },
	{ X86::VFNMADDPD4rr, X86::VFNMADDPD4rm, TB_ALIGN_NONE },
	{ X86::VFNMADDPS4Yrr, X86::VFNMADDPS4Yrm, TB_ALIGN_NONE },
	{ X86::VFNMADDPD4Yrr, X86::VFNMADDPD4Yrm, TB_ALIGN_NONE },
	{ X86::VFMSUBSS4rr, X86::VFMSUBSS4rm, TB_ALIGN_NONE },
	{ X86::VFMSUBSS4rr_Int, X86::VFMSUBSS4rm_Int, TB_NO_REVERSE },
	{ X86::VFMSUBSD4rr, X86::VFMSUBSD4rm, TB_ALIGN_NONE },
	{ X86::VFMSUBSD4rr_Int, X86::VFMSUBSD4rm_Int, TB_NO_REVERSE },
	{ X86::VFMSUBPS4rr, X86::VFMSUBPS4rm, TB_ALIGN_NONE },
	{ X86::VFMSUBPD4rr, X86::VFMSUBPD4rm, TB_ALIGN_NONE },
	{ X86::VFMSUBPS4Yrr, X86::VFMSUBPS4Yrm, TB_ALIGN_NONE },
	{ X86::VFMSUBPD4Yrr, X86::VFMSUBPD4Yrm, TB_ALIGN_NONE },
	{ X86::VFNMSUBSS4rr, X86::VFNMSUBSS4rm, TB_ALIGN_NONE },
	{ X86::VFNMSUBSS4rr_Int, X86::VFNMSUBSS4rm_Int, TB_NO_REVERSE },
	{ X86::VFNMSUBSD4rr, X86::VFNMSUBSD4rm, TB_ALIGN_NONE },
	{ X86::VFNMSUBSD4rr_Int, X86::VFNMSUBSD4rm_Int, TB_NO_REVERSE },
	{ X86::VFNMSUBPS4rr, X86::VFNMSUBPS4rm, TB_ALIGN_NONE },
	{ X86::VFNMSUBPD4rr, X86::VFNMSUBPD4rm, TB_ALIGN_NONE },
	{ X86::VFNMSUBPS4Yrr, X86::VFNMSUBPS4Yrm, TB_ALIGN_NONE },
	{ X86::VFNMSUBPD4Yrr, X86::VFNMSUBPD4Yrm, TB_ALIGN_NONE },
	{ X86::VFMADDSUBPS4rr, X86::VFMADDSUBPS4rm, TB_ALIGN_NONE },
	{ X86::VFMADDSUBPD4rr, X86::VFMADDSUBPD4rm, TB_ALIGN_NONE },
	{ X86::VFMADDSUBPS4Yrr, X86::VFMADDSUBPS4Yrm, TB_ALIGN_NONE },
	{ X86::VFMADDSUBPD4Yrr, X86::VFMADDSUBPD4Yrm, TB_ALIGN_NONE },
	{ X86::VFMSUBADDPS4rr, X86::VFMSUBADDPS4rm, TB_ALIGN_NONE },
	{ X86::VFMSUBADDPD4rr, X86::VFMSUBADDPD4rm, TB_ALIGN_NONE },
	{ X86::VFMSUBADDPS4Yrr, X86::VFMSUBADDPS4Yrm, TB_ALIGN_NONE },
	{ X86::VFMSUBADDPD4Yrr, X86::VFMSUBADDPD4Yrm, TB_ALIGN_NONE },

	// XOP foldable instructions
	{ X86::VPCMOVrrr, X86::VPCMOVrrm, 0 },
	{ X86::VPCMOVrrrY, X86::VPCMOVrrmY, 0 },
	{ X86::VPERMIL2PDrr, X86::VPERMIL2PDrm, 0 },
	{ X86::VPERMIL2PDrrY, X86::VPERMIL2PDrmY, 0 },
	{ X86::VPERMIL2PSrr, X86::VPERMIL2PSrm, 0 },
	{ X86::VPERMIL2PSrrY, X86::VPERMIL2PSrmY, 0 },
	{ X86::VPPERMrrr, X86::VPPERMrrm, 0 },

	// AVX-512 instructions with 3 source operands.
	{ X86::VBLENDMPDZrr, X86::VBLENDMPDZrm, 0 },
	{ X86::VBLENDMPSZrr, X86::VBLENDMPSZrm, 0 },
	{ X86::VPBLENDMDZrr, X86::VPBLENDMDZrm, 0 },
	{ X86::VPBLENDMQZrr, X86::VPBLENDMQZrm, 0 },
	{ X86::VBROADCASTSSZrk, X86::VBROADCASTSSZmk, TB_NO_REVERSE },
	{ X86::VBROADCASTSDZrk, X86::VBROADCASTSDZmk, TB_NO_REVERSE },
	{ X86::VBROADCASTSSZ256rk, X86::VBROADCASTSSZ256mk, TB_NO_REVERSE },
	{ X86::VBROADCASTSDZ256rk, X86::VBROADCASTSDZ256mk, TB_NO_REVERSE },
	{ X86::VBROADCASTSSZ128rk, X86::VBROADCASTSSZ128mk, TB_NO_REVERSE },
	{ X86::VPERMI2Brr, X86::VPERMI2Brm, 0 },
	{ X86::VPERMI2Drr, X86::VPERMI2Drm, 0 },
	{ X86::VPERMI2PSrr, X86::VPERMI2PSrm, 0 },
	{ X86::VPERMI2PDrr, X86::VPERMI2PDrm, 0 },
	{ X86::VPERMI2Qrr, X86::VPERMI2Qrm, 0 },
	{ X86::VPERMI2Wrr, X86::VPERMI2Wrm, 0 },
	{ X86::VPERMT2Brr, X86::VPERMT2Brm, 0 },
	{ X86::VPERMT2Drr, X86::VPERMT2Drm, 0 },
	{ X86::VPERMT2PSrr, X86::VPERMT2PSrm, 0 },
	{ X86::VPERMT2PDrr, X86::VPERMT2PDrm, 0 },
	{ X86::VPERMT2Qrr, X86::VPERMT2Qrm, 0 },
	{ X86::VPERMT2Wrr, X86::VPERMT2Wrm, 0 },
	{ X86::VPTERNLOGDZrri, X86::VPTERNLOGDZrmi, 0 },
	{ X86::VPTERNLOGQZrri, X86::VPTERNLOGQZrmi, 0 },

	// AVX-512VL 256-bit instructions with 3 source operands.
	{ X86::VPERMI2B256rr, X86::VPERMI2B256rm, 0 },
	{ X86::VPERMI2D256rr, X86::VPERMI2D256rm, 0 },
	{ X86::VPERMI2PD256rr, X86::VPERMI2PD256rm, 0 },
	{ X86::VPERMI2PS256rr, X86::VPERMI2PS256rm, 0 },
	{ X86::VPERMI2Q256rr, X86::VPERMI2Q256rm, 0 },
	{ X86::VPERMI2W256rr, X86::VPERMI2W256rm, 0 },
	{ X86::VPERMT2B256rr, X86::VPERMT2B256rm, 0 },
	{ X86::VPERMT2D256rr, X86::VPERMT2D256rm, 0 },
	{ X86::VPERMT2PD256rr, X86::VPERMT2PD256rm, 0 },
	{ X86::VPERMT2PS256rr, X86::VPERMT2PS256rm, 0 },
	{ X86::VPERMT2Q256rr, X86::VPERMT2Q256rm, 0 },
	{ X86::VPERMT2W256rr, X86::VPERMT2W256rm, 0 },
	{ X86::VPTERNLOGDZ256rri, X86::VPTERNLOGDZ256rmi, 0 },
	{ X86::VPTERNLOGQZ256rri, X86::VPTERNLOGQZ256rmi, 0 },

	// AVX-512VL 128-bit instructions with 3 source operands.
	{ X86::VPERMI2B128rr, X86::VPERMI2B128rm, 0 },
	{ X86::VPERMI2D128rr, X86::VPERMI2D128rm, 0 },
	{ X86::VPERMI2PD128rr, X86::VPERMI2PD128rm, 0 },
	{ X86::VPERMI2PS128rr, X86::VPERMI2PS128rm, 0 },
	{ X86::VPERMI2Q128rr, X86::VPERMI2Q128rm, 0 },
	{ X86::VPERMI2W128rr, X86::VPERMI2W128rm, 0 },
	{ X86::VPERMT2B128rr, X86::VPERMT2B128rm, 0 },
	{ X86::VPERMT2D128rr, X86::VPERMT2D128rm, 0 },
	{ X86::VPERMT2PD128rr, X86::VPERMT2PD128rm, 0 },
	{ X86::VPERMT2PS128rr, X86::VPERMT2PS128rm, 0 },
	{ X86::VPERMT2Q128rr, X86::VPERMT2Q128rm, 0 },
	{ X86::VPERMT2W128rr, X86::VPERMT2W128rm, 0 },
	{ X86::VPTERNLOGDZ128rri, X86::VPTERNLOGDZ128rmi, 0 },
	{ X86::VPTERNLOGQZ128rri, X86::VPTERNLOGQZ128rmi, 0 },

	// AVX-512 masked instructions (opcodes with the kz suffix)
	{ X86::VADDPDZrrkz, X86::VADDPDZrmkz, 0 },
	{ X86::VADDPSZrrkz, X86::VADDPSZrmkz, 0 },
	{ X86::VALIGNDZrrikz, X86::VALIGNDZrmikz, 0 },
	{ X86::VALIGNQZrrikz, X86::VALIGNQZrmikz, 0 },
	{ X86::VANDNPDZrrkz, X86::VANDNPDZrmkz, 0 },
	{ X86::VANDNPSZrrkz, X86::VANDNPSZrmkz, 0 },
	{ X86::VANDPDZrrkz, X86::VANDPDZrmkz, 0 },
	{ X86::VANDPSZrrkz, X86::VANDPSZrmkz, 0 },
	{ X86::VDIVPDZrrkz, X86::VDIVPDZrmkz, 0 },
	{ X86::VDIVPSZrrkz, X86::VDIVPSZrmkz, 0 },
	{ X86::VINSERTF32x4Zrrkz, X86::VINSERTF32x4Zrmkz, 0 },
	{ X86::VINSERTF32x8Zrrkz, X86::VINSERTF32x8Zrmkz, 0 },
	{ X86::VINSERTF64x2Zrrkz, X86::VINSERTF64x2Zrmkz, 0 },
	{ X86::VINSERTF64x4Zrrkz, X86::VINSERTF64x4Zrmkz, 0 },
	{ X86::VINSERTI32x4Zrrkz, X86::VINSERTI32x4Zrmkz, 0 },
	{ X86::VINSERTI32x8Zrrkz, X86::VINSERTI32x8Zrmkz, 0 },
	{ X86::VINSERTI64x2Zrrkz, X86::VINSERTI64x2Zrmkz, 0 },
	{ X86::VINSERTI64x4Zrrkz, X86::VINSERTI64x4Zrmkz, 0 },
	{ X86::VMAXCPDZrrkz, X86::VMAXCPDZrmkz, 0 },
	{ X86::VMAXCPSZrrkz, X86::VMAXCPSZrmkz, 0 },
	{ X86::VMAXPDZrrkz, X86::VMAXPDZrmkz, 0 },
	{ X86::VMAXPSZrrkz, X86::VMAXPSZrmkz, 0 },
	{ X86::VMINCPDZrrkz, X86::VMINCPDZrmkz, 0 },
	{ X86::VMINCPSZrrkz, X86::VMINCPSZrmkz, 0 },
	{ X86::VMINPDZrrkz, X86::VMINPDZrmkz, 0 },
	{ X86::VMINPSZrrkz, X86::VMINPSZrmkz, 0 },
	{ X86::VMULPDZrrkz, X86::VMULPDZrmkz, 0 },
	{ X86::VMULPSZrrkz, X86::VMULPSZrmkz, 0 },
	{ X86::VORPDZrrkz, X86::VORPDZrmkz, 0 },
	{ X86::VORPSZrrkz, X86::VORPSZrmkz, 0 },
	{ X86::VPADDBZrrkz, X86::VPADDBZrmkz, 0 },
	{ X86::VPADDDZrrkz, X86::VPADDDZrmkz, 0 },
	{ X86::VPADDQZrrkz, X86::VPADDQZrmkz, 0 },
	{ X86::VPADDSBZrrkz, X86::VPADDSBZrmkz, 0 },
	{ X86::VPADDSWZrrkz, X86::VPADDSWZrmkz, 0 },
	{ X86::VPADDUSBZrrkz, X86::VPADDUSBZrmkz, 0 },
	{ X86::VPADDUSWZrrkz, X86::VPADDUSWZrmkz, 0 },
	{ X86::VPADDWZrrkz, X86::VPADDWZrmkz, 0 },
	{ X86::VPALIGNRZrrikz, X86::VPALIGNRZrmikz, 0 },
	{ X86::VPANDDZrrkz, X86::VPANDDZrmkz, 0 },
	{ X86::VPANDNDZrrkz, X86::VPANDNDZrmkz, 0 },
	{ X86::VPANDNQZrrkz, X86::VPANDNQZrmkz, 0 },
	{ X86::VPANDQZrrkz, X86::VPANDQZrmkz, 0 },
	{ X86::VPERMBZrrkz, X86::VPERMBZrmkz, 0 },
	{ X86::VPERMDZrrkz, X86::VPERMDZrmkz, 0 },
	{ X86::VPERMILPDZrrkz, X86::VPERMILPDZrmkz, 0 },
	{ X86::VPERMILPSZrrkz, X86::VPERMILPSZrmkz, 0 },
	{ X86::VPERMPDZrrkz, X86::VPERMPDZrmkz, 0 },
	{ X86::VPERMPSZrrkz, X86::VPERMPSZrmkz, 0 },
	{ X86::VPERMQZrrkz, X86::VPERMQZrmkz, 0 },
	{ X86::VPERMWZrrkz, X86::VPERMWZrmkz, 0 },
	{ X86::VPMADDUBSWZrrkz, X86::VPMADDUBSWZrmkz, 0 },
	{ X86::VPMADDWDZrrkz, X86::VPMADDWDZrmkz, 0 },
	{ X86::VPORDZrrkz, X86::VPORDZrmkz, 0 },
	{ X86::VPORQZrrkz, X86::VPORQZrmkz, 0 },
	{ X86::VPSHUFBZrrkz, X86::VPSHUFBZrmkz, 0 },
	{ X86::VPSUBBZrrkz, X86::VPSUBBZrmkz, 0 },
	{ X86::VPSUBDZrrkz, X86::VPSUBDZrmkz, 0 },
	{ X86::VPSUBQZrrkz, X86::VPSUBQZrmkz, 0 },
	{ X86::VPSUBSBZrrkz, X86::VPSUBSBZrmkz, 0 },
	{ X86::VPSUBSWZrrkz, X86::VPSUBSWZrmkz, 0 },
	{ X86::VPSUBUSBZrrkz, X86::VPSUBUSBZrmkz, 0 },
	{ X86::VPSUBUSWZrrkz, X86::VPSUBUSWZrmkz, 0 },
	{ X86::VPSUBWZrrkz, X86::VPSUBWZrmkz, 0 },
	{ X86::VPUNPCKHBWZrrkz, X86::VPUNPCKHBWZrmkz, 0 },
	{ X86::VPUNPCKHDQZrrkz, X86::VPUNPCKHDQZrmkz, 0 },
	{ X86::VPUNPCKHQDQZrrkz, X86::VPUNPCKHQDQZrmkz, 0 },
	{ X86::VPUNPCKHWDZrrkz, X86::VPUNPCKHWDZrmkz, 0 },
	{ X86::VPUNPCKLBWZrrkz, X86::VPUNPCKLBWZrmkz, 0 },
	{ X86::VPUNPCKLDQZrrkz, X86::VPUNPCKLDQZrmkz, 0 },
	{ X86::VPUNPCKLQDQZrrkz, X86::VPUNPCKLQDQZrmkz, 0 },
	{ X86::VPUNPCKLWDZrrkz, X86::VPUNPCKLWDZrmkz, 0 },
	{ X86::VPXORDZrrkz, X86::VPXORDZrmkz, 0 },
	{ X86::VPXORQZrrkz, X86::VPXORQZrmkz, 0 },
	{ X86::VSUBPDZrrkz, X86::VSUBPDZrmkz, 0 },
	{ X86::VSUBPSZrrkz, X86::VSUBPSZrmkz, 0 },
	{ X86::VUNPCKHPDZrrkz, X86::VUNPCKHPDZrmkz, 0 },
	{ X86::VUNPCKHPSZrrkz, X86::VUNPCKHPSZrmkz, 0 },
	{ X86::VUNPCKLPDZrrkz, X86::VUNPCKLPDZrmkz, 0 },
	{ X86::VUNPCKLPSZrrkz, X86::VUNPCKLPSZrmkz, 0 },
	{ X86::VXORPDZrrkz, X86::VXORPDZrmkz, 0 },
	{ X86::VXORPSZrrkz, X86::VXORPSZrmkz, 0 },

	// AVX-512{F,VL} masked arithmetic instructions 256-bit
	{ X86::VADDPDZ256rrkz, X86::VADDPDZ256rmkz, 0 },
	{ X86::VADDPSZ256rrkz, X86::VADDPSZ256rmkz, 0 },
	{ X86::VALIGNDZ256rrikz, X86::VALIGNDZ256rmikz, 0 },
	{ X86::VALIGNQZ256rrikz, X86::VALIGNQZ256rmikz, 0 },
	{ X86::VANDNPDZ256rrkz, X86::VANDNPDZ256rmkz, 0 },
	{ X86::VANDNPSZ256rrkz, X86::VANDNPSZ256rmkz, 0 },
	{ X86::VANDPDZ256rrkz, X86::VANDPDZ256rmkz, 0 },
	{ X86::VANDPSZ256rrkz, X86::VANDPSZ256rmkz, 0 },
	{ X86::VDIVPDZ256rrkz, X86::VDIVPDZ256rmkz, 0 },
	{ X86::VDIVPSZ256rrkz, X86::VDIVPSZ256rmkz, 0 },
	{ X86::VINSERTF32x4Z256rrkz, X86::VINSERTF32x4Z256rmkz, 0 },
	{ X86::VINSERTF64x2Z256rrkz, X86::VINSERTF64x2Z256rmkz, 0 },
	{ X86::VINSERTI32x4Z256rrkz, X86::VINSERTI32x4Z256rmkz, 0 },
	{ X86::VINSERTI64x2Z256rrkz, X86::VINSERTI64x2Z256rmkz, 0 },
	{ X86::VMAXCPDZ256rrkz, X86::VMAXCPDZ256rmkz, 0 },
	{ X86::VMAXCPSZ256rrkz, X86::VMAXCPSZ256rmkz, 0 },
	{ X86::VMAXPDZ256rrkz, X86::VMAXPDZ256rmkz, 0 },
	{ X86::VMAXPSZ256rrkz, X86::VMAXPSZ256rmkz, 0 },
	{ X86::VMINCPDZ256rrkz, X86::VMINCPDZ256rmkz, 0 },
	{ X86::VMINCPSZ256rrkz, X86::VMINCPSZ256rmkz, 0 },
	{ X86::VMINPDZ256rrkz, X86::VMINPDZ256rmkz, 0 },
	{ X86::VMINPSZ256rrkz, X86::VMINPSZ256rmkz, 0 },
	{ X86::VMULPDZ256rrkz, X86::VMULPDZ256rmkz, 0 },
	{ X86::VMULPSZ256rrkz, X86::VMULPSZ256rmkz, 0 },
	{ X86::VORPDZ256rrkz, X86::VORPDZ256rmkz, 0 },
	{ X86::VORPSZ256rrkz, X86::VORPSZ256rmkz, 0 },
	{ X86::VPADDBZ256rrkz, X86::VPADDBZ256rmkz, 0 },
	{ X86::VPADDDZ256rrkz, X86::VPADDDZ256rmkz, 0 },
	{ X86::VPADDQZ256rrkz, X86::VPADDQZ256rmkz, 0 },
	{ X86::VPADDSBZ256rrkz, X86::VPADDSBZ256rmkz, 0 },
	{ X86::VPADDSWZ256rrkz, X86::VPADDSWZ256rmkz, 0 },
	{ X86::VPADDUSBZ256rrkz, X86::VPADDUSBZ256rmkz, 0 },
	{ X86::VPADDUSWZ256rrkz, X86::VPADDUSWZ256rmkz, 0 },
	{ X86::VPADDWZ256rrkz, X86::VPADDWZ256rmkz, 0 },
	{ X86::VPALIGNRZ256rrikz, X86::VPALIGNRZ256rmikz, 0 },
	{ X86::VPANDDZ256rrkz, X86::VPANDDZ256rmkz, 0 },
	{ X86::VPANDNDZ256rrkz, X86::VPANDNDZ256rmkz, 0 },
	{ X86::VPANDNQZ256rrkz, X86::VPANDNQZ256rmkz, 0 },
	{ X86::VPANDQZ256rrkz, X86::VPANDQZ256rmkz, 0 },
	{ X86::VPERMBZ256rrkz, X86::VPERMBZ256rmkz, 0 },
	{ X86::VPERMDZ256rrkz, X86::VPERMDZ256rmkz, 0 },
	{ X86::VPERMILPDZ256rrkz, X86::VPERMILPDZ256rmkz, 0 },
	{ X86::VPERMILPSZ256rrkz, X86::VPERMILPSZ256rmkz, 0 },
	{ X86::VPERMPDZ256rrkz, X86::VPERMPDZ256rmkz, 0 },
	{ X86::VPERMPSZ256rrkz, X86::VPERMPSZ256rmkz, 0 },
	{ X86::VPERMQZ256rrkz, X86::VPERMQZ256rmkz, 0 },
	{ X86::VPERMWZ256rrkz, X86::VPERMWZ256rmkz, 0 },
	{ X86::VPMADDUBSWZ256rrkz, X86::VPMADDUBSWZ256rmkz, 0 },
	{ X86::VPMADDWDZ256rrkz, X86::VPMADDWDZ256rmkz, 0 },
	{ X86::VPORDZ256rrkz, X86::VPORDZ256rmkz, 0 },
	{ X86::VPORQZ256rrkz, X86::VPORQZ256rmkz, 0 },
	{ X86::VPSHUFBZ256rrkz, X86::VPSHUFBZ256rmkz, 0 },
	{ X86::VPSUBBZ256rrkz, X86::VPSUBBZ256rmkz, 0 },
	{ X86::VPSUBDZ256rrkz, X86::VPSUBDZ256rmkz, 0 },
	{ X86::VPSUBQZ256rrkz, X86::VPSUBQZ256rmkz, 0 },
	{ X86::VPSUBSBZ256rrkz, X86::VPSUBSBZ256rmkz, 0 },
	{ X86::VPSUBSWZ256rrkz, X86::VPSUBSWZ256rmkz, 0 },
	{ X86::VPSUBUSBZ256rrkz, X86::VPSUBUSBZ256rmkz, 0 },
	{ X86::VPSUBUSWZ256rrkz, X86::VPSUBUSWZ256rmkz, 0 },
	{ X86::VPSUBWZ256rrkz, X86::VPSUBWZ256rmkz, 0 },
	{ X86::VPUNPCKHBWZ256rrkz, X86::VPUNPCKHBWZ256rmkz, 0 },
	{ X86::VPUNPCKHDQZ256rrkz, X86::VPUNPCKHDQZ256rmkz, 0 },
	{ X86::VPUNPCKHQDQZ256rrkz, X86::VPUNPCKHQDQZ256rmkz, 0 },
	{ X86::VPUNPCKHWDZ256rrkz, X86::VPUNPCKHWDZ256rmkz, 0 },
	{ X86::VPUNPCKLBWZ256rrkz, X86::VPUNPCKLBWZ256rmkz, 0 },
	{ X86::VPUNPCKLDQZ256rrkz, X86::VPUNPCKLDQZ256rmkz, 0 },
	{ X86::VPUNPCKLQDQZ256rrkz, X86::VPUNPCKLQDQZ256rmkz, 0 },
	{ X86::VPUNPCKLWDZ256rrkz, X86::VPUNPCKLWDZ256rmkz, 0 },
	{ X86::VPXORDZ256rrkz, X86::VPXORDZ256rmkz, 0 },
	{ X86::VPXORQZ256rrkz, X86::VPXORQZ256rmkz, 0 },
	{ X86::VSUBPDZ256rrkz, X86::VSUBPDZ256rmkz, 0 },
	{ X86::VSUBPSZ256rrkz, X86::VSUBPSZ256rmkz, 0 },
	{ X86::VUNPCKHPDZ256rrkz, X86::VUNPCKHPDZ256rmkz, 0 },
	{ X86::VUNPCKHPSZ256rrkz, X86::VUNPCKHPSZ256rmkz, 0 },
	{ X86::VUNPCKLPDZ256rrkz, X86::VUNPCKLPDZ256rmkz, 0 },
	{ X86::VUNPCKLPSZ256rrkz, X86::VUNPCKLPSZ256rmkz, 0 },
	{ X86::VXORPDZ256rrkz, X86::VXORPDZ256rmkz, 0 },
	{ X86::VXORPSZ256rrkz, X86::VXORPSZ256rmkz, 0 },

	// AVX-512{F,VL} masked arithmetic instructions 128-bit
	{ X86::VADDPDZ128rrkz, X86::VADDPDZ128rmkz, 0 },
	{ X86::VADDPSZ128rrkz, X86::VADDPSZ128rmkz, 0 },
	{ X86::VALIGNDZ128rrikz, X86::VALIGNDZ128rmikz, 0 },
	{ X86::VALIGNQZ128rrikz, X86::VALIGNQZ128rmikz, 0 },
	{ X86::VANDNPDZ128rrkz, X86::VANDNPDZ128rmkz, 0 },
	{ X86::VANDNPSZ128rrkz, X86::VANDNPSZ128rmkz, 0 },
	{ X86::VANDPDZ128rrkz, X86::VANDPDZ128rmkz, 0 },
	{ X86::VANDPSZ128rrkz, X86::VANDPSZ128rmkz, 0 },
	{ X86::VDIVPDZ128rrkz, X86::VDIVPDZ128rmkz, 0 },
	{ X86::VDIVPSZ128rrkz, X86::VDIVPSZ128rmkz, 0 },
	{ X86::VMAXCPDZ128rrkz, X86::VMAXCPDZ128rmkz, 0 },
	{ X86::VMAXCPSZ128rrkz, X86::VMAXCPSZ128rmkz, 0 },
	{ X86::VMAXPDZ128rrkz, X86::VMAXPDZ128rmkz, 0 },
	{ X86::VMAXPSZ128rrkz, X86::VMAXPSZ128rmkz, 0 },
	{ X86::VMINCPDZ128rrkz, X86::VMINCPDZ128rmkz, 0 },
	{ X86::VMINCPSZ128rrkz, X86::VMINCPSZ128rmkz, 0 },
	{ X86::VMINPDZ128rrkz, X86::VMINPDZ128rmkz, 0 },
	{ X86::VMINPSZ128rrkz, X86::VMINPSZ128rmkz, 0 },
	{ X86::VMULPDZ128rrkz, X86::VMULPDZ128rmkz, 0 },
	{ X86::VMULPSZ128rrkz, X86::VMULPSZ128rmkz, 0 },
	{ X86::VORPDZ128rrkz, X86::VORPDZ128rmkz, 0 },
	{ X86::VORPSZ128rrkz, X86::VORPSZ128rmkz, 0 },
	{ X86::VPADDBZ128rrkz, X86::VPADDBZ128rmkz, 0 },
	{ X86::VPADDDZ128rrkz, X86::VPADDDZ128rmkz, 0 },
	{ X86::VPADDQZ128rrkz, X86::VPADDQZ128rmkz, 0 },
	{ X86::VPADDSBZ128rrkz, X86::VPADDSBZ128rmkz, 0 },
	{ X86::VPADDSWZ128rrkz, X86::VPADDSWZ128rmkz, 0 },
	{ X86::VPADDUSBZ128rrkz, X86::VPADDUSBZ128rmkz, 0 },
	{ X86::VPADDUSWZ128rrkz, X86::VPADDUSWZ128rmkz, 0 },
	{ X86::VPADDWZ128rrkz, X86::VPADDWZ128rmkz, 0 },
	{ X86::VPALIGNRZ128rrikz, X86::VPALIGNRZ128rmikz, 0 },
	{ X86::VPANDDZ128rrkz, X86::VPANDDZ128rmkz, 0 },
	{ X86::VPANDNDZ128rrkz, X86::VPANDNDZ128rmkz, 0 },
	{ X86::VPANDNQZ128rrkz, X86::VPANDNQZ128rmkz, 0 },
	{ X86::VPANDQZ128rrkz, X86::VPANDQZ128rmkz, 0 },
	{ X86::VPERMBZ128rrkz, X86::VPERMBZ128rmkz, 0 },
	{ X86::VPERMILPDZ128rrkz, X86::VPERMILPDZ128rmkz, 0 },
	{ X86::VPERMILPSZ128rrkz, X86::VPERMILPSZ128rmkz, 0 },
	{ X86::VPERMWZ128rrkz, X86::VPERMWZ128rmkz, 0 },
	{ X86::VPMADDUBSWZ128rrkz, X86::VPMADDUBSWZ128rmkz, 0 },
	{ X86::VPMADDWDZ128rrkz, X86::VPMADDWDZ128rmkz, 0 },
	{ X86::VPORDZ128rrkz, X86::VPORDZ128rmkz, 0 },
	{ X86::VPORQZ128rrkz, X86::VPORQZ128rmkz, 0 },
	{ X86::VPSHUFBZ128rrkz, X86::VPSHUFBZ128rmkz, 0 },
	{ X86::VPSUBBZ128rrkz, X86::VPSUBBZ128rmkz, 0 },
	{ X86::VPSUBDZ128rrkz, X86::VPSUBDZ128rmkz, 0 },
	{ X86::VPSUBQZ128rrkz, X86::VPSUBQZ128rmkz, 0 },
	{ X86::VPSUBSBZ128rrkz, X86::VPSUBSBZ128rmkz, 0 },
	{ X86::VPSUBSWZ128rrkz, X86::VPSUBSWZ128rmkz, 0 },
	{ X86::VPSUBUSBZ128rrkz, X86::VPSUBUSBZ128rmkz, 0 },
	{ X86::VPSUBUSWZ128rrkz, X86::VPSUBUSWZ128rmkz, 0 },
	{ X86::VPSUBWZ128rrkz, X86::VPSUBWZ128rmkz, 0 },
	{ X86::VPUNPCKHBWZ128rrkz, X86::VPUNPCKHBWZ128rmkz, 0 },
	{ X86::VPUNPCKHDQZ128rrkz, X86::VPUNPCKHDQZ128rmkz, 0 },
	{ X86::VPUNPCKHQDQZ128rrkz, X86::VPUNPCKHQDQZ128rmkz, 0 },
	{ X86::VPUNPCKHWDZ128rrkz, X86::VPUNPCKHWDZ128rmkz, 0 },
	{ X86::VPUNPCKLBWZ128rrkz, X86::VPUNPCKLBWZ128rmkz, 0 },
	{ X86::VPUNPCKLDQZ128rrkz, X86::VPUNPCKLDQZ128rmkz, 0 },
	{ X86::VPUNPCKLQDQZ128rrkz, X86::VPUNPCKLQDQZ128rmkz, 0 },
	{ X86::VPUNPCKLWDZ128rrkz, X86::VPUNPCKLWDZ128rmkz, 0 },
	{ X86::VPXORDZ128rrkz, X86::VPXORDZ128rmkz, 0 },
	{ X86::VPXORQZ128rrkz, X86::VPXORQZ128rmkz, 0 },
	{ X86::VSUBPDZ128rrkz, X86::VSUBPDZ128rmkz, 0 },
	{ X86::VSUBPSZ128rrkz, X86::VSUBPSZ128rmkz, 0 },
	{ X86::VUNPCKHPDZ128rrkz, X86::VUNPCKHPDZ128rmkz, 0 },
	{ X86::VUNPCKHPSZ128rrkz, X86::VUNPCKHPSZ128rmkz, 0 },
	{ X86::VUNPCKLPDZ128rrkz, X86::VUNPCKLPDZ128rmkz, 0 },
	{ X86::VUNPCKLPSZ128rrkz, X86::VUNPCKLPSZ128rmkz, 0 },
	{ X86::VXORPDZ128rrkz, X86::VXORPDZ128rmkz, 0 },
	{ X86::VXORPSZ128rrkz, X86::VXORPSZ128rmkz, 0 },

	// AVX-512 masked foldable instructions (opcodes with the k suffix)
	{ X86::VPERMILPDZrik, X86::VPERMILPDZmik, 0 },
	{ X86::VPERMILPSZrik, X86::VPERMILPSZmik, 0 },
	{ X86::VPERMPDZrik, X86::VPERMPDZmik, 0 },
	{ X86::VPERMQZrik, X86::VPERMQZmik, 0 },
	{ X86::VPMOVSXBDZrrk, X86::VPMOVSXBDZrmk, 0 },
	{ X86::VPMOVSXBQZrrk, X86::VPMOVSXBQZrmk, TB_NO_REVERSE },
	{ X86::VPMOVSXBWZrrk, X86::VPMOVSXBWZrmk, 0 },
	{ X86::VPMOVSXDQZrrk, X86::VPMOVSXDQZrmk, 0 },
	{ X86::VPMOVSXWDZrrk, X86::VPMOVSXWDZrmk, 0 },
	{ X86::VPMOVSXWQZrrk, X86::VPMOVSXWQZrmk, 0 },
	{ X86::VPMOVZXBDZrrk, X86::VPMOVZXBDZrmk, 0 },
	{ X86::VPMOVZXBQZrrk, X86::VPMOVZXBQZrmk, TB_NO_REVERSE },
	{ X86::VPMOVZXBWZrrk, X86::VPMOVZXBWZrmk, 0 },
	{ X86::VPMOVZXDQZrrk, X86::VPMOVZXDQZrmk, 0 },
	{ X86::VPMOVZXWDZrrk, X86::VPMOVZXWDZrmk, 0 },
	{ X86::VPMOVZXWQZrrk, X86::VPMOVZXWQZrmk, 0 },
	{ X86::VPSHUFDZrik, X86::VPSHUFDZmik, 0 },
	{ X86::VPSHUFHWZrik, X86::VPSHUFHWZmik, 0 },
	{ X86::VPSHUFLWZrik, X86::VPSHUFLWZmik, 0 },

	// AVX-512VL 256-bit masked foldable instructions
	{ X86::VPERMILPDZ256rik, X86::VPERMILPDZ256mik, 0 },
	{ X86::VPERMILPSZ256rik, X86::VPERMILPSZ256mik, 0 },
	{ X86::VPERMPDZ256rik, X86::VPERMPDZ256mik, 0 },
	{ X86::VPERMQZ256rik, X86::VPERMQZ256mik, 0 },
	{ X86::VPMOVSXBDZ256rrk, X86::VPMOVSXBDZ256rmk, TB_NO_REVERSE },
	{ X86::VPMOVSXBQZ256rrk, X86::VPMOVSXBQZ256rmk, TB_NO_REVERSE },
	{ X86::VPMOVSXBWZ256rrk, X86::VPMOVSXBWZ256rmk, 0 },
	{ X86::VPMOVSXDQZ256rrk, X86::VPMOVSXDQZ256rmk, 0 },
	{ X86::VPMOVSXWDZ256rrk, X86::VPMOVSXWDZ256rmk, 0 },
	{ X86::VPMOVSXWQZ256rrk, X86::VPMOVSXWQZ256rmk, TB_NO_REVERSE },
	{ X86::VPMOVZXBDZ256rrk, X86::VPMOVZXBDZ256rmk, TB_NO_REVERSE },
	{ X86::VPMOVZXBQZ256rrk, X86::VPMOVZXBQZ256rmk, TB_NO_REVERSE },
	{ X86::VPMOVZXBWZ256rrk, X86::VPMOVZXBWZ256rmk, 0 },
	{ X86::VPMOVZXDQZ256rrk, X86::VPMOVZXDQZ256rmk, 0 },
	{ X86::VPMOVZXWDZ256rrk, X86::VPMOVZXWDZ256rmk, 0 },
	{ X86::VPMOVZXWQZ256rrk, X86::VPMOVZXWQZ256rmk, TB_NO_REVERSE },
	{ X86::VPSHUFDZ256rik, X86::VPSHUFDZ256mik, 0 },
	{ X86::VPSHUFHWZ256rik, X86::VPSHUFHWZ256mik, 0 },
	{ X86::VPSHUFLWZ256rik, X86::VPSHUFLWZ256mik, 0 },

	// AVX-512VL 128-bit masked foldable instructions
	{ X86::VPERMILPDZ128rik, X86::VPERMILPDZ128mik, 0 },
	{ X86::VPERMILPSZ128rik, X86::VPERMILPSZ128mik, 0 },
	{ X86::VPMOVSXBDZ128rrk, X86::VPMOVSXBDZ128rmk, TB_NO_REVERSE },
	{ X86::VPMOVSXBQZ128rrk, X86::VPMOVSXBQZ128rmk, TB_NO_REVERSE },
	{ X86::VPMOVSXBWZ128rrk, X86::VPMOVSXBWZ128rmk, TB_NO_REVERSE },
	{ X86::VPMOVSXDQZ128rrk, X86::VPMOVSXDQZ128rmk, TB_NO_REVERSE },
	{ X86::VPMOVSXWDZ128rrk, X86::VPMOVSXWDZ128rmk, TB_NO_REVERSE },
	{ X86::VPMOVSXWQZ128rrk, X86::VPMOVSXWQZ128rmk, TB_NO_REVERSE },
	{ X86::VPMOVZXBDZ128rrk, X86::VPMOVZXBDZ128rmk, TB_NO_REVERSE },
	{ X86::VPMOVZXBQZ128rrk, X86::VPMOVZXBQZ128rmk, TB_NO_REVERSE },
	{ X86::VPMOVZXBWZ128rrk, X86::VPMOVZXBWZ128rmk, TB_NO_REVERSE },
	{ X86::VPMOVZXDQZ128rrk, X86::VPMOVZXDQZ128rmk, TB_NO_REVERSE },
	{ X86::VPMOVZXWDZ128rrk, X86::VPMOVZXWDZ128rmk, TB_NO_REVERSE },
	{ X86::VPMOVZXWQZ128rrk, X86::VPMOVZXWQZ128rmk, TB_NO_REVERSE },
	{ X86::VPSHUFDZ128rik, X86::VPSHUFDZ128mik, 0 },
	{ X86::VPSHUFHWZ128rik, X86::VPSHUFHWZ128mik, 0 },
	{ X86::VPSHUFLWZ128rik, X86::VPSHUFLWZ128mik, 0 },
	};

	// Register every MemoryFoldTable3 entry in both lookup tables.
	// Each entry's own flags are combined with TB_INDEX_3 and TB_FOLDED_LOAD
	// ("index 3, folded load") before being handed to AddTableEntry.
	// Entries are only read, so iterate by const reference instead of copying.
	for (const X86MemoryFoldTableEntry &Entry : MemoryFoldTable3) {
	  AddTableEntry(RegOp2MemOpTable3, MemOp2RegOpTable,
	                Entry.RegOp, Entry.MemOp,
	                // Index 3, folded load
	                Entry.Flags | TB_INDEX_3 | TB_FOLDED_LOAD);
	}
	// Walk the FMA3 register/memory opcode pairs exposed by X86InstrFMA3Info and
	// register those whose group is not k-masked, using
	// TB_ALIGN_NONE | TB_INDEX_3 | TB_FOLDED_LOAD.
	// I and E deliberately stay declared outside the for statement, under their
	// original names, in case code outside this block still refers to them.
	auto I = X86InstrFMA3Info::rm_begin();
	auto E = X86InstrFMA3Info::rm_end();
	for (; I != E; ++I) {
	  // Groups for which isKMasked() is true are not registered by this loop.
	  if (I.getGroup()->isKMasked())
	    continue;

	  // Intrinsic forms need to pass TB_NO_REVERSE.
	  const bool IsIntrinsic = I.getGroup()->isIntrinsic();
	  AddTableEntry(RegOp2MemOpTable3, MemOp2RegOpTable,
	                I.getRegOpcode(), I.getMemOpcode(),
	                TB_ALIGN_NONE | TB_INDEX_3 | TB_FOLDED_LOAD |
	                    (IsIntrinsic ? TB_NO_REVERSE : 0));
	}

	static const X86MemoryFoldTableEntry MemoryFoldTable4[] = {
	// AVX-512 foldable masked instructions
	{ X86::VADDPDZrrk, X86::VADDPDZrmk, 0 },
	{ X86::VADDPSZrrk, X86::VADDPSZrmk, 0 },
	{ X86::VALIGNDZrrik, X86::VALIGNDZrmik, 0 },
	{ X86::VALIGNQZrrik, X86::VALIGNQZrmik, 0 },
	{ X86::VANDNPDZrrk, X86::VANDNPDZrmk, 0 },
	{ X86::VANDNPSZrrk, X86::VANDNPSZrmk, 0 },
	{ X86::VANDPDZrrk, X86::VANDPDZrmk, 0 },
	{ X86::VANDPSZrrk, X86::VANDPSZrmk, 0 },
	{ X86::VDIVPDZrrk, X86::VDIVPDZrmk, 0 },
	{ X86::VDIVPSZrrk, X86::VDIVPSZrmk, 0 },
	{ X86::VINSERTF32x4Zrrk, X86::VINSERTF32x4Zrmk, 0 },
	{ X86::VINSERTF32x8Zrrk, X86::VINSERTF32x8Zrmk, 0 },
	{ X86::VINSERTF64x2Zrrk, X86::VINSERTF64x2Zrmk, 0 },
	{ X86::VINSERTF64x4Zrrk, X86::VINSERTF64x4Zrmk, 0 },
	{ X86::VINSERTI32x4Zrrk, X86::VINSERTI32x4Zrmk, 0 },
	{ X86::VINSERTI32x8Zrrk, X86::VINSERTI32x8Zrmk, 0 },
	{ X86::VINSERTI64x2Zrrk, X86::VINSERTI64x2Zrmk, 0 },
	{ X86::VINSERTI64x4Zrrk, X86::VINSERTI64x4Zrmk, 0 },
	{ X86::VMAXCPDZrrk, X86::VMAXCPDZrmk, 0 },
	{ X86::VMAXCPSZrrk, X86::VMAXCPSZrmk, 0 },
	{ X86::VMAXPDZrrk, X86::VMAXPDZrmk, 0 },
	{ X86::VMAXPSZrrk, X86::VMAXPSZrmk, 0 },
	{ X86::VMINCPDZrrk, X86::VMINCPDZrmk, 0 },
	{ X86::VMINCPSZrrk, X86::VMINCPSZrmk, 0 },
	{ X86::VMINPDZrrk, X86::VMINPDZrmk, 0 },
	{ X86::VMINPSZrrk, X86::VMINPSZrmk, 0 },
	{ X86::VMULPDZrrk, X86::VMULPDZrmk, 0 },
	{ X86::VMULPSZrrk, X86::VMULPSZrmk, 0 },
	{ X86::VORPDZrrk, X86::VORPDZrmk, 0 },
	{ X86::VORPSZrrk, X86::VORPSZrmk, 0 },
	{ X86::VPADDBZrrk, X86::VPADDBZrmk, 0 },
	{ X86::VPADDDZrrk, X86::VPADDDZrmk, 0 },
	{ X86::VPADDQZrrk, X86::VPADDQZrmk, 0 },
	{ X86::VPADDSBZrrk, X86::VPADDSBZrmk, 0 },
	{ X86::VPADDSWZrrk, X86::VPADDSWZrmk, 0 },
	{ X86::VPADDUSBZrrk, X86::VPADDUSBZrmk, 0 },
	{ X86::VPADDUSWZrrk, X86::VPADDUSWZrmk, 0 },
	{ X86::VPADDWZrrk, X86::VPADDWZrmk, 0 },
	{ X86::VPALIGNRZrrik, X86::VPALIGNRZrmik, 0 },
	{ X86::VPANDDZrrk, X86::VPANDDZrmk, 0 },
	{ X86::VPANDNDZrrk, X86::VPANDNDZrmk, 0 },
	{ X86::VPANDNQZrrk, X86::VPANDNQZrmk, 0 },
	{ X86::VPANDQZrrk, X86::VPANDQZrmk, 0 },
	{ X86::VPERMBZrrk, X86::VPERMBZrmk, 0 },
	{ X86::VPERMDZrrk, X86::VPERMDZrmk, 0 },
	{ X86::VPERMI2Brrk, X86::VPERMI2Brmk, 0 },
	{ X86::VPERMI2Drrk, X86::VPERMI2Drmk, 0 },
	{ X86::VPERMI2PSrrk, X86::VPERMI2PSrmk, 0 },
	{ X86::VPERMI2PDrrk, X86::VPERMI2PDrmk, 0 },
	{ X86::VPERMI2Qrrk, X86::VPERMI2Qrmk, 0 },
	{ X86::VPERMI2Wrrk, X86::VPERMI2Wrmk, 0 },
	{ X86::VPERMILPDZrrk, X86::VPERMILPDZrmk, 0 },
	{ X86::VPERMILPSZrrk, X86::VPERMILPSZrmk, 0 },
	{ X86::VPERMPDZrrk, X86::VPERMPDZrmk, 0 },
	{ X86::VPERMPSZrrk, X86::VPERMPSZrmk, 0 },
	{ X86::VPERMQZrrk, X86::VPERMQZrmk, 0 },
	{ X86::VPERMT2Brrk, X86::VPERMT2Brmk, 0 },
	{ X86::VPERMT2Drrk, X86::VPERMT2Drmk, 0 },
	{ X86::VPERMT2PSrrk, X86::VPERMT2PSrmk, 0 },
	{ X86::VPERMT2PDrrk, X86::VPERMT2PDrmk, 0 },
	{ X86::VPERMT2Qrrk, X86::VPERMT2Qrmk, 0 },
	{ X86::VPERMT2Wrrk, X86::VPERMT2Wrmk, 0 },
	{ X86::VPERMWZrrk, X86::VPERMWZrmk, 0 },
	{ X86::VPMADDUBSWZrrk, X86::VPMADDUBSWZrmk, 0 },
	{ X86::VPMADDWDZrrk, X86::VPMADDWDZrmk, 0 },
	{ X86::VPORDZrrk, X86::VPORDZrmk, 0 },
	{ X86::VPORQZrrk, X86::VPORQZrmk, 0 },
	{ X86::VPSHUFBZrrk, X86::VPSHUFBZrmk, 0 },
	{ X86::VPSUBBZrrk, X86::VPSUBBZrmk, 0 },
	{ X86::VPSUBDZrrk, X86::VPSUBDZrmk, 0 },
	{ X86::VPSUBQZrrk, X86::VPSUBQZrmk, 0 },
	{ X86::VPSUBSBZrrk, X86::VPSUBSBZrmk, 0 },
	{ X86::VPSUBSWZrrk, X86::VPSUBSWZrmk, 0 },
	{ X86::VPSUBUSBZrrk, X86::VPSUBUSBZrmk, 0 },
	{ X86::VPSUBUSWZrrk, X86::VPSUBUSWZrmk, 0 },
	{ X86::VPTERNLOGDZrrik, X86::VPTERNLOGDZrmik, 0 },
	{ X86::VPTERNLOGQZrrik, X86::VPTERNLOGQZrmik, 0 },
	{ X86::VPUNPCKHBWZrrk, X86::VPUNPCKHBWZrmk, 0 },
	{ X86::VPUNPCKHDQZrrk, X86::VPUNPCKHDQZrmk, 0 },
	{ X86::VPUNPCKHQDQZrrk, X86::VPUNPCKHQDQZrmk, 0 },
	{ X86::VPUNPCKHWDZrrk, X86::VPUNPCKHWDZrmk, 0 },
	{ X86::VPUNPCKLBWZrrk, X86::VPUNPCKLBWZrmk, 0 },
	{ X86::VPUNPCKLDQZrrk, X86::VPUNPCKLDQZrmk, 0 },
	{ X86::VPUNPCKLQDQZrrk, X86::VPUNPCKLQDQZrmk, 0 },
	{ X86::VPUNPCKLWDZrrk, X86::VPUNPCKLWDZrmk, 0 },
	{ X86::VPXORDZrrk, X86::VPXORDZrmk, 0 },
	{ X86::VPXORQZrrk, X86::VPXORQZrmk, 0 },
	{ X86::VSUBPDZrrk, X86::VSUBPDZrmk, 0 },
	{ X86::VSUBPSZrrk, X86::VSUBPSZrmk, 0 },
	{ X86::VUNPCKHPDZrrk, X86::VUNPCKHPDZrmk, 0 },
	{ X86::VUNPCKHPSZrrk, X86::VUNPCKHPSZrmk, 0 },
	{ X86::VUNPCKLPDZrrk, X86::VUNPCKLPDZrmk, 0 },
	{ X86::VUNPCKLPSZrrk, X86::VUNPCKLPSZrmk, 0 },
	{ X86::VXORPDZrrk, X86::VXORPDZrmk, 0 },
	{ X86::VXORPSZrrk, X86::VXORPSZrmk, 0 },

	// AVX-512{F,VL} foldable masked instructions 256-bit
	{ X86::VADDPDZ256rrk, X86::VADDPDZ256rmk, 0 },
	{ X86::VADDPSZ256rrk, X86::VADDPSZ256rmk, 0 },
	{ X86::VALIGNDZ256rrik, X86::VALIGNDZ256rmik, 0 },
	{ X86::VALIGNQZ256rrik, X86::VALIGNQZ256rmik, 0 },
	{ X86::VANDNPDZ256rrk, X86::VANDNPDZ256rmk, 0 },
	{ X86::VANDNPSZ256rrk, X86::VANDNPSZ256rmk, 0 },
	{ X86::VANDPDZ256rrk, X86::VANDPDZ256rmk, 0 },
	{ X86::VANDPSZ256rrk, X86::VANDPSZ256rmk, 0 },
	{ X86::VDIVPDZ256rrk, X86::VDIVPDZ256rmk, 0 },
	{ X86::VDIVPSZ256rrk, X86::VDIVPSZ256rmk, 0 },
	{ X86::VINSERTF32x4Z256rrk,X86::VINSERTF32x4Z256rmk, 0 },
	{ X86::VINSERTF64x2Z256rrk,X86::VINSERTF64x2Z256rmk, 0 },
	{ X86::VINSERTI32x4Z256rrk,X86::VINSERTI32x4Z256rmk, 0 },
	{ X86::VINSERTI64x2Z256rrk,X86::VINSERTI64x2Z256rmk, 0 },
	{ X86::VMAXCPDZ256rrk, X86::VMAXCPDZ256rmk, 0 },
	{ X86::VMAXCPSZ256rrk, X86::VMAXCPSZ256rmk, 0 },
	{ X86::VMAXPDZ256rrk, X86::VMAXPDZ256rmk, 0 },
	{ X86::VMAXPSZ256rrk, X86::VMAXPSZ256rmk, 0 },
	{ X86::VMINCPDZ256rrk, X86::VMINCPDZ256rmk, 0 },
	{ X86::VMINCPSZ256rrk, X86::VMINCPSZ256rmk, 0 },
	{ X86::VMINPDZ256rrk, X86::VMINPDZ256rmk, 0 },
	{ X86::VMINPSZ256rrk, X86::VMINPSZ256rmk, 0 },
	{ X86::VMULPDZ256rrk, X86::VMULPDZ256rmk, 0 },
	{ X86::VMULPSZ256rrk, X86::VMULPSZ256rmk, 0 },
	{ X86::VORPDZ256rrk, X86::VORPDZ256rmk, 0 },
	{ X86::VORPSZ256rrk, X86::VORPSZ256rmk, 0 },
	{ X86::VPADDBZ256rrk, X86::VPADDBZ256rmk, 0 },
	{ X86::VPADDDZ256rrk, X86::VPADDDZ256rmk, 0 },
	{ X86::VPADDQZ256rrk, X86::VPADDQZ256rmk, 0 },
	{ X86::VPADDSBZ256rrk, X86::VPADDSBZ256rmk, 0 },
	{ X86::VPADDSWZ256rrk, X86::VPADDSWZ256rmk, 0 },
	{ X86::VPADDUSBZ256rrk, X86::VPADDUSBZ256rmk, 0 },
	{ X86::VPADDUSWZ256rrk, X86::VPADDUSWZ256rmk, 0 },
	{ X86::VPADDWZ256rrk, X86::VPADDWZ256rmk, 0 },
	{ X86::VPALIGNRZ256rrik, X86::VPALIGNRZ256rmik, 0 },
	{ X86::VPANDDZ256rrk, X86::VPANDDZ256rmk, 0 },
	{ X86::VPANDNDZ256rrk, X86::VPANDNDZ256rmk, 0 },
	{ X86::VPANDNQZ256rrk, X86::VPANDNQZ256rmk, 0 },
	{ X86::VPANDQZ256rrk, X86::VPANDQZ256rmk, 0 },
	{ X86::VPERMBZ256rrk, X86::VPERMBZ256rmk, 0 },
	{ X86::VPERMDZ256rrk, X86::VPERMDZ256rmk, 0 },
	{ X86::VPERMI2B256rrk, X86::VPERMI2B256rmk, 0 },
	{ X86::VPERMI2D256rrk, X86::VPERMI2D256rmk, 0 },
	{ X86::VPERMI2PD256rrk, X86::VPERMI2PD256rmk, 0 },
	{ X86::VPERMI2PS256rrk, X86::VPERMI2PS256rmk, 0 },
	{ X86::VPERMI2Q256rrk, X86::VPERMI2Q256rmk, 0 },
	{ X86::VPERMI2W256rrk, X86::VPERMI2W256rmk, 0 },
	{ X86::VPERMILPDZ256rrk, X86::VPERMILPDZ256rmk, 0 },
	{ X86::VPERMILPSZ256rrk, X86::VPERMILPSZ256rmk, 0 },
	{ X86::VPERMPDZ256rrk, X86::VPERMPDZ256rmk, 0 },
	{ X86::VPERMPSZ256rrk, X86::VPERMPSZ256rmk, 0 },
	{ X86::VPERMQZ256rrk, X86::VPERMQZ256rmk, 0 },
	{ X86::VPERMT2B256rrk, X86::VPERMT2B256rmk, 0 },
	{ X86::VPERMT2D256rrk, X86::VPERMT2D256rmk, 0 },
	{ X86::VPERMT2PD256rrk, X86::VPERMT2PD256rmk, 0 },
	{ X86::VPERMT2PS256rrk, X86::VPERMT2PS256rmk, 0 },
	{ X86::VPERMT2Q256rrk, X86::VPERMT2Q256rmk, 0 },
	{ X86::VPERMT2W256rrk, X86::VPERMT2W256rmk, 0 },
	{ X86::VPERMWZ256rrk, X86::VPERMWZ256rmk, 0 },
	{ X86::VPMADDUBSWZ256rrk, X86::VPMADDUBSWZ256rmk, 0 },
	{ X86::VPMADDWDZ256rrk, X86::VPMADDWDZ256rmk, 0 },
	{ X86::VPORDZ256rrk, X86::VPORDZ256rmk, 0 },
	{ X86::VPORQZ256rrk, X86::VPORQZ256rmk, 0 },
	{ X86::VPSHUFBZ256rrk, X86::VPSHUFBZ256rmk, 0 },
	{ X86::VPSUBBZ256rrk, X86::VPSUBBZ256rmk, 0 },
	{ X86::VPSUBDZ256rrk, X86::VPSUBDZ256rmk, 0 },
	{ X86::VPSUBQZ256rrk, X86::VPSUBQZ256rmk, 0 },
	{ X86::VPSUBSBZ256rrk, X86::VPSUBSBZ256rmk, 0 },
	{ X86::VPSUBSWZ256rrk, X86::VPSUBSWZ256rmk, 0 },
	{ X86::VPSUBUSBZ256rrk, X86::VPSUBUSBZ256rmk, 0 },
	{ X86::VPSUBUSWZ256rrk, X86::VPSUBUSWZ256rmk, 0 },
	{ X86::VPSUBWZ256rrk, X86::VPSUBWZ256rmk, 0 },
	{ X86::VPTERNLOGDZ256rrik, X86::VPTERNLOGDZ256rmik, 0 },
	{ X86::VPTERNLOGQZ256rrik, X86::VPTERNLOGQZ256rmik, 0 },
	{ X86::VPUNPCKHBWZ256rrk, X86::VPUNPCKHBWZ256rmk, 0 },
	{ X86::VPUNPCKHDQZ256rrk, X86::VPUNPCKHDQZ256rmk, 0 },
	{ X86::VPUNPCKHQDQZ256rrk, X86::VPUNPCKHQDQZ256rmk, 0 },
	{ X86::VPUNPCKHWDZ256rrk, X86::VPUNPCKHWDZ256rmk, 0 },
	{ X86::VPUNPCKLBWZ256rrk, X86::VPUNPCKLBWZ256rmk, 0 },
	{ X86::VPUNPCKLDQZ256rrk, X86::VPUNPCKLDQZ256rmk, 0 },
	{ X86::VPUNPCKLQDQZ256rrk, X86::VPUNPCKLQDQZ256rmk, 0 },
	{ X86::VPUNPCKLWDZ256rrk, X86::VPUNPCKLWDZ256rmk, 0 },
	{ X86::VPXORDZ256rrk, X86::VPXORDZ256rmk, 0 },
	{ X86::VPXORQZ256rrk, X86::VPXORQZ256rmk, 0 },
	{ X86::VSUBPDZ256rrk, X86::VSUBPDZ256rmk, 0 },
	{ X86::VSUBPSZ256rrk, X86::VSUBPSZ256rmk, 0 },
	{ X86::VUNPCKHPDZ256rrk, X86::VUNPCKHPDZ256rmk, 0 },
	{ X86::VUNPCKHPSZ256rrk, X86::VUNPCKHPSZ256rmk, 0 },
	{ X86::VUNPCKLPDZ256rrk, X86::VUNPCKLPDZ256rmk, 0 },
	{ X86::VUNPCKLPSZ256rrk, X86::VUNPCKLPSZ256rmk, 0 },
	{ X86::VXORPDZ256rrk, X86::VXORPDZ256rmk, 0 },
	{ X86::VXORPSZ256rrk, X86::VXORPSZ256rmk, 0 },

	// AVX-512{F,VL} foldable instructions 128-bit
	{ X86::VADDPDZ128rrk, X86::VADDPDZ128rmk, 0 },
	{ X86::VADDPSZ128rrk, X86::VADDPSZ128rmk, 0 },
	{ X86::VALIGNDZ128rrik, X86::VALIGNDZ128rmik, 0 },
	{ X86::VALIGNQZ128rrik, X86::VALIGNQZ128rmik, 0 },
	{ X86::VANDNPDZ128rrk, X86::VANDNPDZ128rmk, 0 },
	{ X86::VANDNPSZ128rrk, X86::VANDNPSZ128rmk, 0 },
	{ X86::VANDPDZ128rrk, X86::VANDPDZ128rmk, 0 },
	{ X86::VANDPSZ128rrk, X86::VANDPSZ128rmk, 0 },
	{ X86::VDIVPDZ128rrk, X86::VDIVPDZ128rmk, 0 },
	{ X86::VDIVPSZ128rrk, X86::VDIVPSZ128rmk, 0 },
	{ X86::VMAXCPDZ128rrk, X86::VMAXCPDZ128rmk, 0 },
	{ X86::VMAXCPSZ128rrk, X86::VMAXCPSZ128rmk, 0 },
	{ X86::VMAXPDZ128rrk, X86::VMAXPDZ128rmk, 0 },
	{ X86::VMAXPSZ128rrk, X86::VMAXPSZ128rmk, 0 },
	{ X86::VMINCPDZ128rrk, X86::VMINCPDZ128rmk, 0 },
	{ X86::VMINCPSZ128rrk, X86::VMINCPSZ128rmk, 0 },
	{ X86::VMINPDZ128rrk, X86::VMINPDZ128rmk, 0 },
	{ X86::VMINPSZ128rrk, X86::VMINPSZ128rmk, 0 },
	{ X86::VMULPDZ128rrk, X86::VMULPDZ128rmk, 0 },
	{ X86::VMULPSZ128rrk, X86::VMULPSZ128rmk, 0 },
	{ X86::VORPDZ128rrk, X86::VORPDZ128rmk, 0 },
	{ X86::VORPSZ128rrk, X86::VORPSZ128rmk, 0 },
	{ X86::VPADDBZ128rrk, X86::VPADDBZ128rmk, 0 },
	{ X86::VPADDDZ128rrk, X86::VPADDDZ128rmk, 0 },
	{ X86::VPADDQZ128rrk, X86::VPADDQZ128rmk, 0 },
	{ X86::VPADDSBZ128rrk, X86::VPADDSBZ128rmk, 0 },
	{ X86::VPADDSWZ128rrk, X86::VPADDSWZ128rmk, 0 },
	{ X86::VPADDUSBZ128rrk, X86::VPADDUSBZ128rmk, 0 },
	{ X86::VPADDUSWZ128rrk, X86::VPADDUSWZ128rmk, 0 },
	{ X86::VPADDWZ128rrk, X86::VPADDWZ128rmk, 0 },
	{ X86::VPALIGNRZ128rrik, X86::VPALIGNRZ128rmik, 0 },
	{ X86::VPANDDZ128rrk, X86::VPANDDZ128rmk, 0 },
	{ X86::VPANDNDZ128rrk, X86::VPANDNDZ128rmk, 0 },
	{ X86::VPANDNQZ128rrk, X86::VPANDNQZ128rmk, 0 },
	{ X86::VPANDQZ128rrk, X86::VPANDQZ128rmk, 0 },
	{ X86::VPERMBZ128rrk, X86::VPERMBZ128rmk, 0 },
	{ X86::VPERMI2B128rrk, X86::VPERMI2B128rmk, 0 },
	{ X86::VPERMI2D128rrk, X86::VPERMI2D128rmk, 0 },
	{ X86::VPERMI2PD128rrk, X86::VPERMI2PD128rmk, 0 },
	{ X86::VPERMI2PS128rrk, X86::VPERMI2PS128rmk, 0 },
	{ X86::VPERMI2Q128rrk, X86::VPERMI2Q128rmk, 0 },
	{ X86::VPERMI2W128rrk, X86::VPERMI2W128rmk, 0 },
	{ X86::VPERMILPDZ128rrk, X86::VPERMILPDZ128rmk, 0 },
	{ X86::VPERMILPSZ128rrk, X86::VPERMILPSZ128rmk, 0 },
	{ X86::VPERMT2B128rrk, X86::VPERMT2B128rmk, 0 },
	{ X86::VPERMT2D128rrk, X86::VPERMT2D128rmk, 0 },
	{ X86::VPERMT2PD128rrk, X86::VPERMT2PD128rmk, 0 },
	{ X86::VPERMT2PS128rrk, X86::VPERMT2PS128rmk, 0 },
	{ X86::VPERMT2Q128rrk, X86::VPERMT2Q128rmk, 0 },
	{ X86::VPERMT2W128rrk, X86::VPERMT2W128rmk, 0 },
	{ X86::VPERMWZ128rrk, X86::VPERMWZ128rmk, 0 },
	{ X86::VPMADDUBSWZ128rrk, X86::VPMADDUBSWZ128rmk, 0 },
	{ X86::VPMADDWDZ128rrk, X86::VPMADDWDZ128rmk, 0 },
	{ X86::VPORDZ128rrk, X86::VPORDZ128rmk, 0 },
	{ X86::VPORQZ128rrk, X86::VPORQZ128rmk, 0 },
	{ X86::VPSHUFBZ128rrk, X86::VPSHUFBZ128rmk, 0 },
	{ X86::VPSUBBZ128rrk, X86::VPSUBBZ128rmk, 0 },
	{ X86::VPSUBDZ128rrk, X86::VPSUBDZ128rmk, 0 },
	{ X86::VPSUBQZ128rrk, X86::VPSUBQZ128rmk, 0 },
	{ X86::VPSUBSBZ128rrk, X86::VPSUBSBZ128rmk, 0 },
	{ X86::VPSUBSWZ128rrk, X86::VPSUBSWZ128rmk, 0 },
	{ X86::VPSUBUSBZ128rrk, X86::VPSUBUSBZ128rmk, 0 },
	{ X86::VPSUBUSWZ128rrk, X86::VPSUBUSWZ128rmk, 0 },
	{ X86::VPSUBWZ128rrk, X86::VPSUBWZ128rmk, 0 },
	{ X86::VPTERNLOGDZ128rrik, X86::VPTERNLOGDZ128rmik, 0 },
	{ X86::VPTERNLOGQZ128rrik, X86::VPTERNLOGQZ128rmik, 0 },
	{ X86::VPUNPCKHBWZ128rrk, X86::VPUNPCKHBWZ128rmk, 0 },
	{ X86::VPUNPCKHDQZ128rrk, X86::VPUNPCKHDQZ128rmk, 0 },
	{ X86::VPUNPCKHQDQZ128rrk, X86::VPUNPCKHQDQZ128rmk, 0 },
	{ X86::VPUNPCKHWDZ128rrk, X86::VPUNPCKHWDZ128rmk, 0 },
	{ X86::VPUNPCKLBWZ128rrk, X86::VPUNPCKLBWZ128rmk, 0 },
	{ X86::VPUNPCKLDQZ128rrk, X86::VPUNPCKLDQZ128rmk, 0 },
	{ X86::VPUNPCKLQDQZ128rrk, X86::VPUNPCKLQDQZ128rmk, 0 },
	{ X86::VPUNPCKLWDZ128rrk, X86::VPUNPCKLWDZ128rmk, 0 },
	{ X86::VPXORDZ128rrk, X86::VPXORDZ128rmk, 0 },
	{ X86::VPXORQZ128rrk, X86::VPXORQZ128rmk, 0 },
	{ X86::VSUBPDZ128rrk, X86::VSUBPDZ128rmk, 0 },
	{ X86::VSUBPSZ128rrk, X86::VSUBPSZ128rmk, 0 },
	{ X86::VUNPCKHPDZ128rrk, X86::VUNPCKHPDZ128rmk, 0 },
	{ X86::VUNPCKHPSZ128rrk, X86::VUNPCKHPSZ128rmk, 0 },
	{ X86::VUNPCKLPDZ128rrk, X86::VUNPCKLPDZ128rmk, 0 },
	{ X86::VUNPCKLPSZ128rrk, X86::VUNPCKLPSZ128rmk, 0 },
	{ X86::VXORPDZ128rrk, X86::VXORPDZ128rmk, 0 },
	{ X86::VXORPSZ128rrk, X86::VXORPSZ128rmk, 0 },

	// 512-bit three source instructions with zero masking.
	{ X86::VPERMI2Brrkz, X86::VPERMI2Brmkz, 0 },
	{ X86::VPERMI2Drrkz, X86::VPERMI2Drmkz, 0 },
	{ X86::VPERMI2PSrrkz, X86::VPERMI2PSrmkz, 0 },
	{ X86::VPERMI2PDrrkz, X86::VPERMI2PDrmkz, 0 },
	{ X86::VPERMI2Qrrkz, X86::VPERMI2Qrmkz, 0 },
	{ X86::VPERMI2Wrrkz, X86::VPERMI2Wrmkz, 0 },
	{ X86::VPERMT2Brrkz, X86::VPERMT2Brmkz, 0 },
	{ X86::VPERMT2Drrkz, X86::VPERMT2Drmkz, 0 },
	{ X86::VPERMT2PSrrkz, X86::VPERMT2PSrmkz, 0 },
	{ X86::VPERMT2PDrrkz, X86::VPERMT2PDrmkz, 0 },
	{ X86::VPERMT2Qrrkz, X86::VPERMT2Qrmkz, 0 },
	{ X86::VPERMT2Wrrkz, X86::VPERMT2Wrmkz, 0 },
	{ X86::VPTERNLOGDZrrikz, X86::VPTERNLOGDZrmikz, 0 },
	{ X86::VPTERNLOGQZrrikz, X86::VPTERNLOGQZrmikz, 0 },

	// 256-bit three source instructions with zero masking.
	{ X86::VPERMI2B256rrkz, X86::VPERMI2B256rmkz, 0 },
	{ X86::VPERMI2D256rrkz, X86::VPERMI2D256rmkz, 0 },
	{ X86::VPERMI2PD256rrkz, X86::VPERMI2PD256rmkz, 0 },
	{ X86::VPERMI2PS256rrkz, X86::VPERMI2PS256rmkz, 0 },
	{ X86::VPERMI2Q256rrkz, X86::VPERMI2Q256rmkz, 0 },
	{ X86::VPERMI2W256rrkz, X86::VPERMI2W256rmkz, 0 },
	{ X86::VPERMT2B256rrkz, X86::VPERMT2B256rmkz, 0 },
	{ X86::VPERMT2D256rrkz, X86::VPERMT2D256rmkz, 0 },
	{ X86::VPERMT2PD256rrkz, X86::VPERMT2PD256rmkz, 0 },
	{ X86::VPERMT2PS256rrkz, X86::VPERMT2PS256rmkz, 0 },
	{ X86::VPERMT2Q256rrkz, X86::VPERMT2Q256rmkz, 0 },
	{ X86::VPERMT2W256rrkz, X86::VPERMT2W256rmkz, 0 },
	{ X86::VPTERNLOGDZ256rrikz,X86::VPTERNLOGDZ256rmikz, 0 },
	{ X86::VPTERNLOGQZ256rrikz,X86::VPTERNLOGQZ256rmikz, 0 },

	// 128-bit three source instructions with zero masking.
	{ X86::VPERMI2B128rrkz, X86::VPERMI2B128rmkz, 0 },
	{ X86::VPERMI2D128rrkz, X86::VPERMI2D128rmkz, 0 },
	{ X86::VPERMI2PD128rrkz, X86::VPERMI2PD128rmkz, 0 },
	{ X86::VPERMI2PS128rrkz, X86::VPERMI2PS128rmkz, 0 },
	{ X86::VPERMI2Q128rrkz, X86::VPERMI2Q128rmkz, 0 },
	{ X86::VPERMI2W128rrkz, X86::VPERMI2W128rmkz, 0 },
	{ X86::VPERMT2B128rrkz, X86::VPERMT2B128rmkz, 0 },
	{ X86::VPERMT2D128rrkz, X86::VPERMT2D128rmkz, 0 },
	{ X86::VPERMT2PD128rrkz, X86::VPERMT2PD128rmkz, 0 },
	{ X86::VPERMT2PS128rrkz, X86::VPERMT2PS128rmkz, 0 },
	{ X86::VPERMT2Q128rrkz, X86::VPERMT2Q128rmkz, 0 },
	{ X86::VPERMT2W128rrkz, X86::VPERMT2W128rmkz, 0 },
	{ X86::VPTERNLOGDZ128rrikz,X86::VPTERNLOGDZ128rmikz, 0 },
	{ X86::VPTERNLOGQZ128rrikz,X86::VPTERNLOGQZ128rmikz, 0 },
	};

	for (X86MemoryFoldTableEntry Entry : MemoryFoldTable4) {
	AddTableEntry(RegOp2MemOpTable4, MemOp2RegOpTable,
	Entry.RegOp, Entry.MemOp,
	// Index 4, folded load
	Entry.Flags \| TB_INDEX_4 \| TB_FOLDED_LOAD);
	}
	for (I = X86InstrFMA3Info::rm_begin(); I != E; ++I) {
	if (I.getGroup()->isKMasked()) {
	// Intrinsics need to pass TB_NO_REVERSE.
	if (I.getGroup()->isIntrinsic()) {
	AddTableEntry(RegOp2MemOpTable4, MemOp2RegOpTable,
	I.getRegOpcode(), I.getMemOpcode(),
	TB_ALIGN_NONE \| TB_INDEX_4 \| TB_FOLDED_LOAD \| TB_NO_REVERSE);
	} else {
	AddTableEntry(RegOp2MemOpTable4, MemOp2RegOpTable,
	I.getRegOpcode(), I.getMemOpcode(),
	TB_ALIGN_NONE \| TB_INDEX_4 \| TB_FOLDED_LOAD);
	}
	}
	}
	}

	/// Register one (RegOp, MemOp) pair in the folding tables.
	///
	/// The pair is stored in \p R2MTable keyed by \p RegOp unless \p Flags has
	/// TB_NO_FORWARD set, and in \p M2RTable keyed by \p MemOp unless \p Flags has
	/// TB_NO_REVERSE set. Each stored value carries \p Flags alongside the
	/// opposite opcode. A key that is already present trips an assertion.
	void
	X86InstrInfo::AddTableEntry(RegOp2MemOpTableType &R2MTable,
	                            MemOp2RegOpTableType &M2RTable,
	                            uint16_t RegOp, uint16_t MemOp, uint16_t Flags) {
	  const bool RecordForward = (Flags & TB_NO_FORWARD) == 0;
	  const bool RecordReverse = (Flags & TB_NO_REVERSE) == 0;

	  // Register opcode -> memory opcode.
	  if (RecordForward) {
	    assert(!R2MTable.count(RegOp) && "Duplicate entry!");
	    R2MTable[RegOp] = std::make_pair(MemOp, Flags);
	  }

	  // Memory opcode -> register opcode.
	  if (RecordReverse) {
	    assert(!M2RTable.count(MemOp) &&
	           "Duplicated entries in unfolding maps?");
	    M2RTable[MemOp] = std::make_pair(RegOp, Flags);
	  }
	}

	/// Recognize the register-to-register MOVSX/MOVZX opcodes listed below.
	///
	/// On success, \p SrcReg is set to the register of operand 1, \p DstReg to
	/// the register of operand 0, and \p SubIdx to the sub-register index that
	/// matches the source width (sub_8bit, sub_16bit or sub_32bit). Returns false,
	/// leaving the outputs untouched, for any other opcode, for 8-bit sources when
	/// the subtarget is not 64-bit, or when either operand carries a sub-register.
	bool
	X86InstrInfo::isCoalescableExtInstr(const MachineInstr &MI,
	                                    unsigned &SrcReg, unsigned &DstReg,
	                                    unsigned &SubIdx) const {
	  // Classify the opcode once; this avoids a second switch over the same
	  // opcode list and an implicit fallthrough between case groups.
	  unsigned Idx = 0;
	  switch (MI.getOpcode()) {
	  default:
	    return false;
	  case X86::MOVSX16rr8:
	  case X86::MOVZX16rr8:
	  case X86::MOVSX32rr8:
	  case X86::MOVZX32rr8:
	  case X86::MOVSX64rr8:
	    if (!Subtarget.is64Bit())
	      // It's not always legal to reference the low 8-bit of the larger
	      // register in 32-bit mode.
	      return false;
	    Idx = X86::sub_8bit;
	    break;
	  case X86::MOVSX32rr16:
	  case X86::MOVZX32rr16:
	  case X86::MOVSX64rr16:
	    Idx = X86::sub_16bit;
	    break;
	  case X86::MOVSX64rr32:
	    Idx = X86::sub_32bit;
	    break;
	  }

	  if (MI.getOperand(0).getSubReg() || MI.getOperand(1).getSubReg())
	    // Be conservative.
	    return false;

	  SrcReg = MI.getOperand(1).getReg();
	  DstReg = MI.getOperand(0).getReg();
	  SubIdx = Idx;
	  return true;
	}

	/// Return the stack-pointer adjustment attributed to \p MI.
	///
	/// - Call-frame setup/destroy pseudos: operand 0 rounded up to the stack
	///   alignment, minus operand 1; returned as-is for the setup opcode and
	///   negated for the destroy opcode.
	/// - Calls: the negated operand 1 of the next call-frame destroy pseudo in
	///   the same block, or 0 if another call or the end of the block is reached
	///   first.
	/// - The PUSH opcodes listed below: 4 for the 32-bit forms, 8 for the 64-bit
	///   forms.
	/// - Anything else: 0.
	int X86InstrInfo::getSPAdjust(const MachineInstr &MI) const {
	  const MachineFunction *MF = MI.getParent()->getParent();
	  const TargetFrameLowering *TFI = MF->getSubtarget().getFrameLowering();

	  if (MI.getOpcode() == getCallFrameSetupOpcode() ||
	      MI.getOpcode() == getCallFrameDestroyOpcode()) {
	    unsigned StackAlign = TFI->getStackAlignment();
	    // Round operand 0 up to a multiple of the stack alignment.
	    int SPAdj =
	        (MI.getOperand(0).getImm() + StackAlign - 1) / StackAlign * StackAlign;

	    SPAdj -= MI.getOperand(1).getImm();

	    if (MI.getOpcode() == getCallFrameSetupOpcode())
	      return SPAdj;
	    else
	      return -SPAdj;
	  }

	  // To know whether a call adjusts the stack, we need information
	  // that is bound to the following ADJCALLSTACKUP pseudo.
	  // Look for the next ADJCALLSTACKUP that follows the call.
	  if (MI.isCall()) {
	    const MachineBasicBlock *MBB = MI.getParent();
	    auto I = ++MachineBasicBlock::const_iterator(MI);
	    for (auto E = MBB->end(); I != E; ++I) {
	      if (I->getOpcode() == getCallFrameDestroyOpcode() ||
	          I->isCall())
	        break;
	    }

	    // If we could not find a frame destroy opcode, then it has already
	    // been simplified, so we don't care. The scan may also have run off the
	    // end of the block, in which case I must not be dereferenced.
	    if (I == MBB->end() || I->getOpcode() != getCallFrameDestroyOpcode())
	      return 0;

	    return -(I->getOperand(1).getImm());
	  }

	  // Currently handle only PUSHes we can reasonably expect to see
	  // in call sequences
	  switch (MI.getOpcode()) {
	  default:
	    return 0;
	  case X86::PUSH32i8:
	  case X86::PUSH32r:
	  case X86::PUSH32rmm:
	  case X86::PUSH32rmr:
	  case X86::PUSHi32:
	    return 4;
	  case X86::PUSH64i8:
	  case X86::PUSH64r:
	  case X86::PUSH64rmm:
	  case X86::PUSH64rmr:
	  case X86::PUSH64i32:
	    return 8;
	  }
	}

	/// Test whether the address operands of \p MI that start at index \p Op are a
	/// plain frame-index reference: a frame-index base, an immediate scale of 1,
	/// a register index equal to 0 and an immediate displacement of 0. If so,
	/// store the frame index in \p FrameIndex and return true; otherwise return
	/// false and leave \p FrameIndex untouched.
	bool X86InstrInfo::isFrameOperand(const MachineInstr &MI, unsigned int Op,
	                                  int &FrameIndex) const {
	  // First verify the operand kinds, in address-operand order ...
	  const auto &Base = MI.getOperand(Op + X86::AddrBaseReg);
	  if (!Base.isFI())
	    return false;

	  const auto &Scale = MI.getOperand(Op + X86::AddrScaleAmt);
	  if (!Scale.isImm())
	    return false;

	  const auto &Index = MI.getOperand(Op + X86::AddrIndexReg);
	  if (!Index.isReg())
	    return false;

	  const auto &Disp = MI.getOperand(Op + X86::AddrDisp);
	  if (!Disp.isImm())
	    return false;

	  // ... then their values.
	  if (Scale.getImm() != 1)
	    return false;
	  if (Index.getReg() != 0)
	    return false;
	  if (Disp.getImm() != 0)
	    return false;

	  FrameIndex = Base.getIndex();
	  return true;
	}

	/// Return true if \p Opcode is one of the load opcodes enumerated below, and
	/// false for every other opcode.
	static bool isFrameLoadOpcode(int Opcode) {
	  switch (Opcode) {
	  // General-purpose and x87.
	  case X86::MOV8rm:
	  case X86::MOV16rm:
	  case X86::MOV32rm:
	  case X86::MOV64rm:
	  case X86::LD_Fp64m:
	  // SSE.
	  case X86::MOVSSrm:
	  case X86::MOVSDrm:
	  case X86::MOVAPSrm:
	  case X86::MOVUPSrm:
	  case X86::MOVAPDrm:
	  case X86::MOVUPDrm:
	  case X86::MOVDQArm:
	  case X86::MOVDQUrm:
	  // AVX, 128-bit.
	  case X86::VMOVSSrm:
	  case X86::VMOVSDrm:
	  case X86::VMOVAPSrm:
	  case X86::VMOVUPSrm:
	  case X86::VMOVAPDrm:
	  case X86::VMOVUPDrm:
	  case X86::VMOVDQArm:
	  case X86::VMOVDQUrm:
	  // AVX, 256-bit.
	  case X86::VMOVUPSYrm:
	  case X86::VMOVAPSYrm:
	  case X86::VMOVUPDYrm:
	  case X86::VMOVAPDYrm:
	  case X86::VMOVDQUYrm:
	  case X86::VMOVDQAYrm:
	  // MMX.
	  case X86::MMX_MOVD64rm:
	  case X86::MMX_MOVQ64rm:
	  // AVX-512 scalar and packed floating point.
	  case X86::VMOVSSZrm:
	  case X86::VMOVSDZrm:
	  case X86::VMOVAPSZrm:
	  case X86::VMOVAPSZ128rm:
	  case X86::VMOVAPSZ256rm:
	  case X86::VMOVAPSZ128rm_NOVLX:
	  case X86::VMOVAPSZ256rm_NOVLX:
	  case X86::VMOVUPSZrm:
	  case X86::VMOVUPSZ128rm:
	  case X86::VMOVUPSZ256rm:
	  case X86::VMOVUPSZ128rm_NOVLX:
	  case X86::VMOVUPSZ256rm_NOVLX:
	  case X86::VMOVAPDZrm:
	  case X86::VMOVAPDZ128rm:
	  case X86::VMOVAPDZ256rm:
	  case X86::VMOVUPDZrm:
	  case X86::VMOVUPDZ128rm:
	  case X86::VMOVUPDZ256rm:
	  // AVX-512 packed integer.
	  case X86::VMOVDQA32Zrm:
	  case X86::VMOVDQA32Z128rm:
	  case X86::VMOVDQA32Z256rm:
	  case X86::VMOVDQU32Zrm:
	  case X86::VMOVDQU32Z128rm:
	  case X86::VMOVDQU32Z256rm:
	  case X86::VMOVDQA64Zrm:
	  case X86::VMOVDQA64Z128rm:
	  case X86::VMOVDQA64Z256rm:
	  case X86::VMOVDQU64Zrm:
	  case X86::VMOVDQU64Z128rm:
	  case X86::VMOVDQU64Z256rm:
	  case X86::VMOVDQU8Zrm:
	  case X86::VMOVDQU8Z128rm:
	  case X86::VMOVDQU8Z256rm:
	  case X86::VMOVDQU16Zrm:
	  case X86::VMOVDQU16Z128rm:
	  case X86::VMOVDQU16Z256rm:
	  // Mask registers.
	  case X86::KMOVBkm:
	  case X86::KMOVWkm:
	  case X86::KMOVDkm:
	  case X86::KMOVQkm:
	    return true;
	  default:
	    return false;
	  }
	}

	/// Return true if \p Opcode is one of the store opcodes enumerated below, and
	/// false for every other opcode.
	static bool isFrameStoreOpcode(int Opcode) {
	  switch (Opcode) {
	  // General-purpose and x87.
	  case X86::MOV8mr:
	  case X86::MOV16mr:
	  case X86::MOV32mr:
	  case X86::MOV64mr:
	  case X86::ST_FpP64m:
	  // SSE.
	  case X86::MOVSSmr:
	  case X86::MOVSDmr:
	  case X86::MOVAPSmr:
	  case X86::MOVUPSmr:
	  case X86::MOVAPDmr:
	  case X86::MOVUPDmr:
	  case X86::MOVDQAmr:
	  case X86::MOVDQUmr:
	  // AVX, 128-bit.
	  case X86::VMOVSSmr:
	  case X86::VMOVSDmr:
	  case X86::VMOVAPSmr:
	  case X86::VMOVUPSmr:
	  case X86::VMOVAPDmr:
	  case X86::VMOVUPDmr:
	  case X86::VMOVDQAmr:
	  case X86::VMOVDQUmr:
	  // AVX, 256-bit.
	  case X86::VMOVUPSYmr:
	  case X86::VMOVAPSYmr:
	  case X86::VMOVUPDYmr:
	  case X86::VMOVAPDYmr:
	  case X86::VMOVDQUYmr:
	  case X86::VMOVDQAYmr:
	  // AVX-512 scalar and packed floating point.
	  case X86::VMOVSSZmr:
	  case X86::VMOVSDZmr:
	  case X86::VMOVUPSZmr:
	  case X86::VMOVUPSZ128mr:
	  case X86::VMOVUPSZ256mr:
	  case X86::VMOVUPSZ128mr_NOVLX:
	  case X86::VMOVUPSZ256mr_NOVLX:
	  case X86::VMOVAPSZmr:
	  case X86::VMOVAPSZ128mr:
	  case X86::VMOVAPSZ256mr:
	  case X86::VMOVAPSZ128mr_NOVLX:
	  case X86::VMOVAPSZ256mr_NOVLX:
	  case X86::VMOVUPDZmr:
	  case X86::VMOVUPDZ128mr:
	  case X86::VMOVUPDZ256mr:
	  case X86::VMOVAPDZmr:
	  case X86::VMOVAPDZ128mr:
	  case X86::VMOVAPDZ256mr:
	  // AVX-512 packed integer.
	  case X86::VMOVDQA32Zmr:
	  case X86::VMOVDQA32Z128mr:
	  case X86::VMOVDQA32Z256mr:
	  case X86::VMOVDQU32Zmr:
	  case X86::VMOVDQU32Z128mr:
	  case X86::VMOVDQU32Z256mr:
	  case X86::VMOVDQA64Zmr:
	  case X86::VMOVDQA64Z128mr:
	  case X86::VMOVDQA64Z256mr:
	  case X86::VMOVDQU64Zmr:
	  case X86::VMOVDQU64Z128mr:
	  case X86::VMOVDQU64Z256mr:
	  case X86::VMOVDQU8Zmr:
	  case X86::VMOVDQU8Z128mr:
	  case X86::VMOVDQU8Z256mr:
	  case X86::VMOVDQU16Zmr:
	  case X86::VMOVDQU16Z128mr:
	  case X86::VMOVDQU16Z256mr:
	  // MMX.
	  case X86::MMX_MOVD64mr:
	  case X86::MMX_MOVQ64mr:
	  case X86::MMX_MOVNTQmr:
	  // Mask registers.
	  case X86::KMOVBmk:
	  case X86::KMOVWmk:
	  case X86::KMOVDmk:
	  case X86::KMOVQmk:
	    return true;
	  default:
	    return false;
	  }
	}

	/// If \p MI has an opcode accepted by isFrameLoadOpcode, operand 0 has no
	/// sub-register, and the address operands starting at index 1 satisfy
	/// isFrameOperand, return the register of operand 0 (with \p FrameIndex set
	/// by isFrameOperand). Otherwise return 0.
	unsigned X86InstrInfo::isLoadFromStackSlot(const MachineInstr &MI,
	                                           int &FrameIndex) const {
	  if (!isFrameLoadOpcode(MI.getOpcode()))
	    return 0;

	  const auto &Dst = MI.getOperand(0);
	  if (Dst.getSubReg() != 0)
	    return 0;
	  if (!isFrameOperand(MI, 1, FrameIndex))
	    return 0;

	  return Dst.getReg();
	}

	unsigned X86InstrInfo::isLoadFromStackSlotPostFE(const MachineInstr &MI,
	int &FrameIndex) const {
	if (isFrameLoadOpcode(MI.getOpcode())) {
	unsigned Reg;
	if ((Reg = isLoadFromStackSlot(MI, FrameIndex)))
	return Reg;
	// Check for post-frame index elimination operations
	const MachineMemOperand *Dummy;
	return hasLoadFromStackSlot(MI, Dummy, FrameIndex);
	}
	return 0;
	}

	/// If \p MI has an opcode accepted by isFrameStoreOpcode, the operand at
	/// index X86::AddrNumOperands has no sub-register, and the address operands
	/// starting at index 0 satisfy isFrameOperand, return the register of that
	/// operand (with \p FrameIndex set by isFrameOperand). Otherwise return 0.
	unsigned X86InstrInfo::isStoreToStackSlot(const MachineInstr &MI,
	                                          int &FrameIndex) const {
	  if (!isFrameStoreOpcode(MI.getOpcode()))
	    return 0;

	  const auto &Stored = MI.getOperand(X86::AddrNumOperands);
	  if (Stored.getSubReg() != 0)
	    return 0;
	  if (!isFrameOperand(MI, 0, FrameIndex))
	    return 0;

	  return Stored.getReg();
	}

	unsigned X86InstrInfo::isStoreToStackSlotPostFE(const MachineInstr &MI,
	int &FrameIndex) const {
	if (isFrameStoreOpcode(MI.getOpcode())) {
	unsigned Reg;
	if ((Reg = isStoreToStackSlot(MI, FrameIndex)))
	return Reg;
	// Check for post-frame index elimination operations
	const MachineMemOperand *Dummy;
	return hasStoreToStackSlot(MI, Dummy, FrameIndex);
	}
	return 0;
	}

	/// Return true if register is PIC base; i.e. defined by X86::MOVPC32r.
	///
	/// Only virtual registers are considered. The result is true when
	/// \p BaseReg has at least one defining instruction and every defining
	/// instruction is X86::MOVPC32r (more than one such def trips an assertion).
	static bool regIsPICBase(unsigned BaseReg, const MachineRegisterInfo &MRI) {
	  // Don't waste compile time scanning use-def chains of physregs.
	  if (!TargetRegisterInfo::isVirtualRegister(BaseReg))
	    return false;
	  bool isPICBase = false;
	  for (MachineRegisterInfo::def_instr_iterator I = MRI.def_instr_begin(BaseReg),
	         E = MRI.def_instr_end(); I != E; ++I) {
	    // Query the defining instruction through the iterator; no copy of the
	    // MachineInstr is made.
	    if (I->getOpcode() != X86::MOVPC32r)
	      return false;
	    assert(!isPICBase && "More than one PIC base?");
	    isPICBase = true;
	  }
	  return isPICBase;
	}

	/// Decide whether \p MI can be trivially rematerialized.
	///
	/// The load opcodes listed below qualify only when the address has a register
	/// base, an immediate scale, no index register, and the load is a
	/// dereferenceable invariant load; the base must then be 0, RIP, or a PIC
	/// base register. LEA32r/LEA64r qualify when there is no index register, the
	/// displacement is not a register, and the base is a non-register operand,
	/// register 0, or a PIC base register. Every other opcode returns true.
	bool X86InstrInfo::isReallyTriviallyReMaterializable(const MachineInstr &MI,
	                                                     AliasAnalysis *AA) const {
	  switch (MI.getOpcode()) {
	  default: break;
	  case X86::MOV8rm:
	  case X86::MOV8rm_NOREX:
	  case X86::MOV16rm:
	  case X86::MOV32rm:
	  case X86::MOV64rm:
	  case X86::LD_Fp64m:
	  case X86::MOVSSrm:
	  case X86::MOVSDrm:
	  case X86::MOVAPSrm:
	  case X86::MOVUPSrm:
	  case X86::MOVAPDrm:
	  case X86::MOVUPDrm:
	  case X86::MOVDQArm:
	  case X86::MOVDQUrm:
	  case X86::VMOVSSrm:
	  case X86::VMOVSDrm:
	  case X86::VMOVAPSrm:
	  case X86::VMOVUPSrm:
	  case X86::VMOVAPDrm:
	  case X86::VMOVUPDrm:
	  case X86::VMOVDQArm:
	  case X86::VMOVDQUrm:
	  case X86::VMOVAPSYrm:
	  case X86::VMOVUPSYrm:
	  case X86::VMOVAPDYrm:
	  case X86::VMOVUPDYrm:
	  case X86::VMOVDQAYrm:
	  case X86::VMOVDQUYrm:
	  case X86::MMX_MOVD64rm:
	  case X86::MMX_MOVQ64rm:
	  // AVX-512
	  case X86::VMOVSSZrm:
	  case X86::VMOVSDZrm:
	  case X86::VMOVAPDZ128rm:
	  case X86::VMOVAPDZ256rm:
	  case X86::VMOVAPDZrm:
	  case X86::VMOVAPSZ128rm:
	  case X86::VMOVAPSZ256rm:
	  case X86::VMOVAPSZ128rm_NOVLX:
	  case X86::VMOVAPSZ256rm_NOVLX:
	  case X86::VMOVAPSZrm:
	  case X86::VMOVDQA32Z128rm:
	  case X86::VMOVDQA32Z256rm:
	  case X86::VMOVDQA32Zrm:
	  case X86::VMOVDQA64Z128rm:
	  case X86::VMOVDQA64Z256rm:
	  case X86::VMOVDQA64Zrm:
	  case X86::VMOVDQU16Z128rm:
	  case X86::VMOVDQU16Z256rm:
	  case X86::VMOVDQU16Zrm:
	  case X86::VMOVDQU32Z128rm:
	  case X86::VMOVDQU32Z256rm:
	  case X86::VMOVDQU32Zrm:
	  case X86::VMOVDQU64Z128rm:
	  case X86::VMOVDQU64Z256rm:
	  case X86::VMOVDQU64Zrm:
	  case X86::VMOVDQU8Z128rm:
	  case X86::VMOVDQU8Z256rm:
	  case X86::VMOVDQU8Zrm:
	  case X86::VMOVUPDZ128rm:
	  case X86::VMOVUPDZ256rm:
	  case X86::VMOVUPDZrm:
	  case X86::VMOVUPSZ128rm:
	  case X86::VMOVUPSZ256rm:
	  case X86::VMOVUPSZ128rm_NOVLX:
	  case X86::VMOVUPSZ256rm_NOVLX:
	  case X86::VMOVUPSZrm: {
	    // Loads from constant pools are trivially rematerializable.
	    if (MI.getOperand(1 + X86::AddrBaseReg).isReg() &&
	        MI.getOperand(1 + X86::AddrScaleAmt).isImm() &&
	        MI.getOperand(1 + X86::AddrIndexReg).isReg() &&
	        MI.getOperand(1 + X86::AddrIndexReg).getReg() == 0 &&
	        MI.isDereferenceableInvariantLoad(AA)) {
	      unsigned BaseReg = MI.getOperand(1 + X86::AddrBaseReg).getReg();
	      // No base register, or RIP-relative: nothing else to check.
	      if (BaseReg == 0 || BaseReg == X86::RIP)
	        return true;
	      // Allow re-materialization of PIC load. Loads whose displacement is a
	      // global are rejected unless ReMatPICStubLoad is set.
	      if (!ReMatPICStubLoad && MI.getOperand(1 + X86::AddrDisp).isGlobal())
	        return false;
	      const MachineFunction &MF = *MI.getParent()->getParent();
	      const MachineRegisterInfo &MRI = MF.getRegInfo();
	      return regIsPICBase(BaseReg, MRI);
	    }
	    return false;
	  }

	  case X86::LEA32r:
	  case X86::LEA64r: {
	    if (MI.getOperand(1 + X86::AddrScaleAmt).isImm() &&
	        MI.getOperand(1 + X86::AddrIndexReg).isReg() &&
	        MI.getOperand(1 + X86::AddrIndexReg).getReg() == 0 &&
	        !MI.getOperand(1 + X86::AddrDisp).isReg()) {
	      // lea fi#, lea GV, etc. are all rematerializable.
	      if (!MI.getOperand(1 + X86::AddrBaseReg).isReg())
	        return true;
	      unsigned BaseReg = MI.getOperand(1 + X86::AddrBaseReg).getReg();
	      if (BaseReg == 0)
	        return true;
	      // Allow re-materialization of lea PICBase + x.
	      const MachineFunction &MF = *MI.getParent()->getParent();
	      const MachineRegisterInfo &MRI = MF.getRegInfo();
	      return regIsPICBase(BaseReg, MRI);
	    }
	    return false;
	  }
	  }

	  // All other instructions marked M_REMATERIALIZABLE are always trivially
	  // rematerializable.
	  return true;
	}

	/// Conservatively decide whether EFLAGS may be clobbered at position \p I of
	/// \p MBB. At most four instructions are examined in each direction; when
	/// neither scan reaches a conclusion the answer is false.
	bool X86InstrInfo::isSafeToClobberEFLAGS(MachineBasicBlock &MBB,
	                                         MachineBasicBlock::iterator I) const {
	  const MachineBasicBlock::iterator End = MBB.end();

	  // Forward scan, starting at I itself: a use of EFLAGS means it is live
	  // (unsafe); a def or a clobbering register mask with no use on the same
	  // instruction means it is about to be overwritten anyway (safe).
	  MachineBasicBlock::iterator Cur = I;
	  for (unsigned Visited = 0; Cur != End && Visited < 4; ++Visited) {
	    bool DefinesFlags = false;
	    for (const MachineOperand &Op : Cur->operands()) {
	      if (Op.isRegMask() && Op.clobbersPhysReg(X86::EFLAGS))
	        DefinesFlags = true;
	      if (!Op.isReg() || Op.getReg() != X86::EFLAGS)
	        continue;
	      if (Op.isUse())
	        return false;
	      DefinesFlags = true;
	    }

	    // This instruction defines EFLAGS, no need to look any further.
	    if (DefinesFlags)
	      return true;

	    // Advance, skipping over DBG_VALUE.
	    ++Cur;
	    while (Cur != End && Cur->isDebugValue())
	      ++Cur;
	  }

	  // It is safe to clobber EFLAGS at the end of a block if no successor has it
	  // live in.
	  if (Cur == End) {
	    for (MachineBasicBlock *Succ : MBB.successors())
	      if (Succ->isLiveIn(X86::EFLAGS))
	        return false;
	    return true;
	  }

	  // Backward scan from I.
	  const MachineBasicBlock::iterator Begin = MBB.begin();
	  Cur = I;
	  for (unsigned Visited = 0; Visited < 4; ++Visited) {
	    // If we make it to the beginning of the block, it's safe to clobber
	    // EFLAGS iff EFLAGS is not live-in.
	    if (Cur == Begin)
	      return !MBB.isLiveIn(X86::EFLAGS);

	    // Step back, skipping over DBG_VALUE.
	    --Cur;
	    while (Cur != Begin && Cur->isDebugValue())
	      --Cur;

	    bool KillsFlags = false;
	    for (const MachineOperand &Op : Cur->operands()) {
	      // A register mask may clobber EFLAGS, but we should still look for a
	      // live EFLAGS def.
	      if (Op.isRegMask() && Op.clobbersPhysReg(X86::EFLAGS))
	        KillsFlags = true;
	      if (Op.isReg() && Op.getReg() == X86::EFLAGS) {
	        if (Op.isDef())
	          return Op.isDead();
	        if (Op.isKill())
	          KillsFlags = true;
	      }
	    }

	    // This instruction kills EFLAGS and doesn't redefine it, so
	    // there's no need to look further.
	    if (KillsFlags)
	      return true;
	  }

	  // Conservative answer.
	  return false;
	}

	/// Insert a rematerialized copy of \p Orig before \p I in \p MBB and rewrite
	/// its operand-0 register to \p DestReg / \p SubIdx.
	///
	/// Normally \p Orig is cloned. If \p Orig defines EFLAGS and
	/// isSafeToClobberEFLAGS rejects the insertion point, a MOV32ri with the
	/// equivalent immediate is built instead; only MOV32r0, MOV32r1 and MOV32r_1
	/// are expected in that situation.
	void X86InstrInfo::reMaterialize(MachineBasicBlock &MBB,
	                                 MachineBasicBlock::iterator I,
	                                 unsigned DestReg, unsigned SubIdx,
	                                 const MachineInstr &Orig,
	                                 const TargetRegisterInfo &TRI) const {
	  // Does the original instruction write EFLAGS?
	  bool DefinesEFLAGS = false;
	  for (const MachineOperand &Op : Orig.operands()) {
	    if (!Op.isReg() || !Op.isDef())
	      continue;
	    if (Op.getReg() == X86::EFLAGS) {
	      DefinesEFLAGS = true;
	      break;
	    }
	  }

	  if (!DefinesEFLAGS || isSafeToClobberEFLAGS(MBB, I)) {
	    // A plain clone is fine here.
	    MachineInstr *Clone = MBB.getParent()->CloneMachineInstr(&Orig);
	    MBB.insert(I, Clone);
	  } else {
	    // The instruction clobbers EFLAGS. Re-materialize as MOV32ri to avoid side
	    // effects.
	    int Imm;
	    switch (Orig.getOpcode()) {
	    case X86::MOV32r0:
	      Imm = 0;
	      break;
	    case X86::MOV32r1:
	      Imm = 1;
	      break;
	    case X86::MOV32r_1:
	      Imm = -1;
	      break;
	    default:
	      llvm_unreachable("Unexpected instruction!");
	    }

	    BuildMI(MBB, I, Orig.getDebugLoc(), get(X86::MOV32ri))
	        .addOperand(Orig.getOperand(0))
	        .addImm(Imm);
	  }

	  // The instruction just inserted sits immediately before I.
	  std::prev(I)->substituteRegister(Orig.getOperand(0).getReg(), DestReg,
	                                   SubIdx, TRI);
	}

	/// Return true if any operand of \p MI is a register def of EFLAGS that is
	/// not marked dead.
	bool X86InstrInfo::hasLiveCondCodeDef(MachineInstr &MI) const {
	  for (const MachineOperand &Op : MI.operands()) {
	    if (!Op.isReg() || !Op.isDef())
	      continue;
	    if (Op.getReg() != X86::EFLAGS)
	      continue;
	    if (!Op.isDead())
	      return true;
	  }
	  return false;
	}

	/// Return the immediate held in operand \p ShiftAmtOperandIdx of \p MI,
	/// masked down to the width of the shift count.
	inline static unsigned getTruncatedShiftCount(MachineInstr &MI,
	                                              unsigned ShiftAmtOperandIdx) {
	  // The shift count is six bits with the REX.W prefix and five bits without.
	  unsigned CountMask = 31;
	  if (MI.getDesc().TSFlags & X86II::REX_W)
	    CountMask = 63;

	  const unsigned RawCount = MI.getOperand(ShiftAmtOperandIdx).getImm();
	  return RawCount & CountMask;
	}

	/// Check whether the given shift count can be represented by a LEA
	/// instruction, i.e. whether it is 1, 2 or 3.
	inline static bool isTruncatedShiftCountForLEA(unsigned ShAmt) {
	  // Left shift instructions can be transformed into load-effective-address
	  // instructions if we can encode them appropriately.
	  // A LEA instruction utilizes a SIB byte to encode its scale factor.
	  // The SIB.scale field is two bits wide which means that we can encode any
	  // shift amount less than 4. A shift by zero is rejected as well.
	  switch (ShAmt) {
	  case 1:
	  case 2:
	  case 3:
	    return true;
	  default:
	    return false;
	  }
	}

	/// Prepare source operand \p Src of \p MI for use in a LEA of opcode \p Opc.
	///
	/// Outputs: \p NewSrc is the register to use, \p isKill / \p isUndef are the
	/// flags to put on it, and \p ImplicitOp is set (LEA64_32r with a physical
	/// source only) to an implicit copy of \p Src. \p AllowSP selects between the
	/// plain and the _NOSP register classes. Returns false only when a virtual
	/// source register cannot be constrained to the required class.
	bool X86InstrInfo::classifyLEAReg(MachineInstr &MI, const MachineOperand &Src,
	                                  unsigned Opc, bool AllowSP, unsigned &NewSrc,
	                                  bool &isKill, bool &isUndef,
	                                  MachineOperand &ImplicitOp,
	                                  LiveVariables *LV) const {
	  MachineFunction &MF = *MI.getParent()->getParent();
	  const TargetRegisterClass *RC;
	  if (AllowSP) {
	    RC = Opc != X86::LEA32r ? &X86::GR64RegClass : &X86::GR32RegClass;
	  } else {
	    RC = Opc != X86::LEA32r ?
	      &X86::GR64_NOSPRegClass : &X86::GR32_NOSPRegClass;
	  }
	  unsigned SrcReg = Src.getReg();

	  // For both LEA64 and LEA32 the register already has essentially the right
	  // type (32-bit or 64-bit) we may just need to forbid SP.
	  if (Opc != X86::LEA64_32r) {
	    NewSrc = SrcReg;
	    isKill = Src.isKill();
	    isUndef = Src.isUndef();

	    if (TargetRegisterInfo::isVirtualRegister(NewSrc) &&
	        !MF.getRegInfo().constrainRegClass(NewSrc, RC))
	      return false;

	    return true;
	  }

	  // This is for an LEA64_32r and incoming registers are 32-bit. One way or
	  // another we need to add 64-bit registers to the final MI.
	  if (TargetRegisterInfo::isPhysicalRegister(SrcReg)) {
	    ImplicitOp = Src;
	    ImplicitOp.setImplicit();

	    NewSrc = getX86SubSuperRegister(Src.getReg(), 64);
	    isKill = Src.isKill();
	    isUndef = Src.isUndef();
	  } else {
	    // Virtual register of the wrong class, we have to create a temporary 64-bit
	    // vreg to feed into the LEA.
	    NewSrc = MF.getRegInfo().createVirtualRegister(RC);
	    // The COPY writes Src into the 32-bit sub-register of NewSrc and is
	    // inserted before MI in MI's own block.
	    MachineInstr *Copy =
	        BuildMI(*MI.getParent(), MI, MI.getDebugLoc(),
	                get(TargetOpcode::COPY))
	            .addReg(NewSrc, RegState::Define | RegState::Undef,
	                    X86::sub_32bit)
	            .addOperand(Src);

	    // Which is obviously going to be dead after we're done with it.
	    isKill = true;
	    isUndef = false;

	    if (LV)
	      LV->replaceKillInstruction(SrcReg, MI, *Copy);
	  }

	  // We've set all the parameters without issue.
	  return true;
	}

	/// Helper for convertToThreeAddress when 16-bit LEA is disabled, use 32-bit
	/// LEA to form 3-address code by promoting to a 32-bit superregister and then
	/// truncating back down to a 16-bit subregister.
	///
	/// \p MIOpc must be one of the 16-bit opcodes handled in the switch below;
	/// any other value is unreachable. The new instructions are inserted before
	/// \p MI in \p MFI, and the final COPY that writes the original destination
	/// is returned. When \p LV is non-null its kill information is updated for
	/// the registers involved.
	MachineInstr *X86InstrInfo::convertToThreeAddressWithLEA(
	    unsigned MIOpc, MachineFunction::iterator &MFI, MachineInstr &MI,
	    LiveVariables *LV) const {
	  MachineBasicBlock::iterator MBBI = MI.getIterator();
	  unsigned Dest = MI.getOperand(0).getReg();
	  unsigned Src = MI.getOperand(1).getReg();
	  bool isDead = MI.getOperand(0).isDead();
	  bool isKill = MI.getOperand(1).isKill();

	  MachineRegisterInfo &RegInfo = MFI->getParent()->getRegInfo();
	  unsigned leaOutReg = RegInfo.createVirtualRegister(&X86::GR32RegClass);
	  unsigned Opc, leaInReg;
	  if (Subtarget.is64Bit()) {
	    Opc = X86::LEA64_32r;
	    leaInReg = RegInfo.createVirtualRegister(&X86::GR64_NOSPRegClass);
	  } else {
	    Opc = X86::LEA32r;
	    leaInReg = RegInfo.createVirtualRegister(&X86::GR32_NOSPRegClass);
	  }

	  // Build and insert into an implicit UNDEF value. This is OK because
	  // we'll be shifting and then extracting the lower 16-bits.
	  // This has the potential to cause partial register stall. e.g.
	  // movw (%rbp,%rcx,2), %dx
	  // leal -65(%rdx), %esi
	  // But testing has shown this does help performance in 64-bit mode (at
	  // least on modern x86 machines).
	  BuildMI(*MFI, MBBI, MI.getDebugLoc(), get(X86::IMPLICIT_DEF), leaInReg);
	  MachineInstr *InsMI =
	      BuildMI(*MFI, MBBI, MI.getDebugLoc(), get(TargetOpcode::COPY))
	          .addReg(leaInReg, RegState::Define, X86::sub_16bit)
	          .addReg(Src, getKillRegState(isKill));

	  MachineInstrBuilder MIB =
	      BuildMI(*MFI, MBBI, MI.getDebugLoc(), get(Opc), leaOutReg);
	  switch (MIOpc) {
	  default: llvm_unreachable("Unreachable!");
	  case X86::SHL16ri: {
	    // The shift becomes the LEA scale: no base, index = leaInReg.
	    unsigned ShAmt = MI.getOperand(2).getImm();
	    MIB.addReg(0).addImm(1ULL << ShAmt)
	       .addReg(leaInReg, RegState::Kill).addImm(0).addReg(0);
	    break;
	  }
	  case X86::INC16r:
	    addRegOffset(MIB, leaInReg, true, 1);
	    break;
	  case X86::DEC16r:
	    addRegOffset(MIB, leaInReg, true, -1);
	    break;
	  case X86::ADD16ri:
	  case X86::ADD16ri8:
	  case X86::ADD16ri_DB:
	  case X86::ADD16ri8_DB:
	    addRegOffset(MIB, leaInReg, true, MI.getOperand(2).getImm());
	    break;
	  case X86::ADD16rr:
	  case X86::ADD16rr_DB: {
	    unsigned Src2 = MI.getOperand(2).getReg();
	    bool isKill2 = MI.getOperand(2).isKill();
	    unsigned leaInReg2 = 0;
	    MachineInstr *InsMI2 = nullptr;
	    if (Src == Src2) {
	      // ADD16rr %reg1028<kill>, %reg1028
	      // just a single insert_subreg.
	      addRegReg(MIB, leaInReg, true, leaInReg, false);
	    } else {
	      if (Subtarget.is64Bit())
	        leaInReg2 = RegInfo.createVirtualRegister(&X86::GR64_NOSPRegClass);
	      else
	        leaInReg2 = RegInfo.createVirtualRegister(&X86::GR32_NOSPRegClass);
	      // Build and insert into an implicit UNDEF value. This is OK because
	      // we'll be shifting and then extracting the lower 16-bits.
	      // These are inserted before the LEA that MIB refers to.
	      BuildMI(*MFI, &*MIB, MI.getDebugLoc(), get(X86::IMPLICIT_DEF), leaInReg2);
	      InsMI2 = BuildMI(*MFI, &*MIB, MI.getDebugLoc(), get(TargetOpcode::COPY))
	                   .addReg(leaInReg2, RegState::Define, X86::sub_16bit)
	                   .addReg(Src2, getKillRegState(isKill2));
	      addRegReg(MIB, leaInReg, true, leaInReg2, true);
	    }
	    if (LV && isKill2 && InsMI2)
	      LV->replaceKillInstruction(Src2, MI, *InsMI2);
	    break;
	  }
	  }

	  MachineInstr *NewMI = MIB;
	  // Copy the low 16 bits of the LEA result into the original destination.
	  MachineInstr *ExtMI =
	      BuildMI(*MFI, MBBI, MI.getDebugLoc(), get(TargetOpcode::COPY))
	          .addReg(Dest, RegState::Define | getDeadRegState(isDead))
	          .addReg(leaOutReg, RegState::Kill, X86::sub_16bit);

	  if (LV) {
	    // Update live variables
	    LV->getVarInfo(leaInReg).Kills.push_back(NewMI);
	    LV->getVarInfo(leaOutReg).Kills.push_back(ExtMI);
	    if (isKill)
	      LV->replaceKillInstruction(Src, MI, *InsMI);
	    if (isDead)
	      LV->replaceKillInstruction(Dest, MI, *ExtMI);
	  }

	  return ExtMI;
	}

	/// This method must be implemented by targets that
	/// set the M_CONVERTIBLE_TO_3_ADDR flag. When this flag is set, the target
	/// may be able to convert a two-address instruction into a true
	/// three-address instruction on demand. This allows the X86 target (for
	/// example) to convert ADD and SHL instructions into LEA instructions if they
	/// would require register copies due to two-addressness.
	///
	/// The conversion is refused (nullptr is returned) when MI has a live
	/// condition-code def, when the opcode is not one of the SHL/INC/DEC/ADD
	/// forms handled below, when the shift amount cannot be encoded as an LEA
	/// scale, or when a source register cannot be used as an LEA operand.
	/// 16-bit forms are currently routed to convertToThreeAddressWithLEA in
	/// 64-bit mode and refused otherwise (see DisableLEA16).
	///
	/// On success the new instruction is inserted before MI in the block
	/// referenced by MFI, and, if LV is non-null, kill/dead information that
	/// pointed at MI is moved to the new instruction.
	///
	/// This method returns a null pointer if the transformation cannot be
	/// performed, otherwise it returns the new instruction.
	///
	MachineInstr *
	X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
	                                    MachineInstr &MI, LiveVariables *LV) const {
	  // The following opcodes also sets the condition code register(s). Only
	  // convert them to equivalent lea if the condition code register def's
	  // are dead!
	  if (hasLiveCondCodeDef(MI))
	    return nullptr;

	  MachineFunction &MF = *MI.getParent()->getParent();
	  // All instructions input are two-addr instructions. Get the known operands.
	  const MachineOperand &Dest = MI.getOperand(0);
	  const MachineOperand &Src = MI.getOperand(1);

	  MachineInstr *NewMI = nullptr;
	  // FIXME: 16-bit LEA's are really slow on Athlons, but not bad on P4's. When
	  // we have better subtarget support, enable the 16-bit LEA generation here.
	  // 16-bit LEA is also slow on Core2.
	  bool DisableLEA16 = true;
	  bool is64Bit = Subtarget.is64Bit();

	  unsigned MIOpc = MI.getOpcode();
	  switch (MIOpc) {
	  default: return nullptr;
	  case X86::SHL64ri: {
	    assert(MI.getNumOperands() >= 3 && "Unknown shift instruction!");
	    unsigned ShAmt = getTruncatedShiftCount(MI, 2);
	    if (!isTruncatedShiftCountForLEA(ShAmt)) return nullptr;

	    // LEA can't handle RSP.
	    if (TargetRegisterInfo::isVirtualRegister(Src.getReg()) &&
	        !MF.getRegInfo().constrainRegClass(Src.getReg(),
	                                           &X86::GR64_NOSPRegClass))
	      return nullptr;

	    // dest = lea [0 + src * (1 << ShAmt) + 0]
	    NewMI = BuildMI(MF, MI.getDebugLoc(), get(X86::LEA64r))
	                .addOperand(Dest)
	                .addReg(0)
	                .addImm(1ULL << ShAmt)
	                .addOperand(Src)
	                .addImm(0)
	                .addReg(0);
	    break;
	  }
	  case X86::SHL32ri: {
	    assert(MI.getNumOperands() >= 3 && "Unknown shift instruction!");
	    unsigned ShAmt = getTruncatedShiftCount(MI, 2);
	    if (!isTruncatedShiftCountForLEA(ShAmt)) return nullptr;

	    unsigned Opc = is64Bit ? X86::LEA64_32r : X86::LEA32r;

	    // LEA can't handle ESP.
	    bool isKill, isUndef;
	    unsigned SrcReg;
	    MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
	    if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ false,
	                        SrcReg, isKill, isUndef, ImplicitOp, LV))
	      return nullptr;

	    MachineInstrBuilder MIB =
	        BuildMI(MF, MI.getDebugLoc(), get(Opc))
	            .addOperand(Dest)
	            .addReg(0)
	            .addImm(1ULL << ShAmt)
	            .addReg(SrcReg, getKillRegState(isKill) | getUndefRegState(isUndef))
	            .addImm(0)
	            .addReg(0);
	    if (ImplicitOp.getReg() != 0)
	      MIB.addOperand(ImplicitOp);
	    NewMI = MIB;

	    break;
	  }
	  case X86::SHL16ri: {
	    assert(MI.getNumOperands() >= 3 && "Unknown shift instruction!");
	    unsigned ShAmt = getTruncatedShiftCount(MI, 2);
	    if (!isTruncatedShiftCountForLEA(ShAmt)) return nullptr;

	    if (DisableLEA16)
	      return is64Bit ? convertToThreeAddressWithLEA(MIOpc, MFI, MI, LV)
	                     : nullptr;
	    NewMI = BuildMI(MF, MI.getDebugLoc(), get(X86::LEA16r))
	                .addOperand(Dest)
	                .addReg(0)
	                .addImm(1ULL << ShAmt)
	                .addOperand(Src)
	                .addImm(0)
	                .addReg(0);
	    break;
	  }
	  case X86::INC64r:
	  case X86::INC32r: {
	    assert(MI.getNumOperands() >= 2 && "Unknown inc instruction!");
	    unsigned Opc = MIOpc == X86::INC64r ? X86::LEA64r
	        : (is64Bit ? X86::LEA64_32r : X86::LEA32r);
	    bool isKill, isUndef;
	    unsigned SrcReg;
	    MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
	    if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ false,
	                        SrcReg, isKill, isUndef, ImplicitOp, LV))
	      return nullptr;

	    MachineInstrBuilder MIB =
	        BuildMI(MF, MI.getDebugLoc(), get(Opc))
	            .addOperand(Dest)
	            .addReg(SrcReg,
	                    getKillRegState(isKill) | getUndefRegState(isUndef));
	    if (ImplicitOp.getReg() != 0)
	      MIB.addOperand(ImplicitOp);

	    NewMI = addOffset(MIB, 1);
	    break;
	  }
	  case X86::INC16r:
	    if (DisableLEA16)
	      return is64Bit ? convertToThreeAddressWithLEA(MIOpc, MFI, MI, LV)
	                     : nullptr;
	    assert(MI.getNumOperands() >= 2 && "Unknown inc instruction!");
	    NewMI = addOffset(BuildMI(MF, MI.getDebugLoc(), get(X86::LEA16r))
	                          .addOperand(Dest)
	                          .addOperand(Src),
	                      1);
	    break;
	  case X86::DEC64r:
	  case X86::DEC32r: {
	    assert(MI.getNumOperands() >= 2 && "Unknown dec instruction!");
	    unsigned Opc = MIOpc == X86::DEC64r ? X86::LEA64r
	        : (is64Bit ? X86::LEA64_32r : X86::LEA32r);

	    bool isKill, isUndef;
	    unsigned SrcReg;
	    MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
	    if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ false,
	                        SrcReg, isKill, isUndef, ImplicitOp, LV))
	      return nullptr;

	    MachineInstrBuilder MIB = BuildMI(MF, MI.getDebugLoc(), get(Opc))
	                                  .addOperand(Dest)
	                                  .addReg(SrcReg, getUndefRegState(isUndef) |
	                                                      getKillRegState(isKill));
	    if (ImplicitOp.getReg() != 0)
	      MIB.addOperand(ImplicitOp);

	    NewMI = addOffset(MIB, -1);

	    break;
	  }
	  case X86::DEC16r:
	    if (DisableLEA16)
	      return is64Bit ? convertToThreeAddressWithLEA(MIOpc, MFI, MI, LV)
	                     : nullptr;
	    assert(MI.getNumOperands() >= 2 && "Unknown dec instruction!");
	    NewMI = addOffset(BuildMI(MF, MI.getDebugLoc(), get(X86::LEA16r))
	                          .addOperand(Dest)
	                          .addOperand(Src),
	                      -1);
	    break;
	  case X86::ADD64rr:
	  case X86::ADD64rr_DB:
	  case X86::ADD32rr:
	  case X86::ADD32rr_DB: {
	    assert(MI.getNumOperands() >= 3 && "Unknown add instruction!");
	    unsigned Opc;
	    if (MIOpc == X86::ADD64rr || MIOpc == X86::ADD64rr_DB)
	      Opc = X86::LEA64r;
	    else
	      Opc = is64Bit ? X86::LEA64_32r : X86::LEA32r;

	    // The first source is classified with AllowSP=true, the second with
	    // AllowSP=false.
	    bool isKill, isUndef;
	    unsigned SrcReg;
	    MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
	    if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ true,
	                        SrcReg, isKill, isUndef, ImplicitOp, LV))
	      return nullptr;

	    const MachineOperand &Src2 = MI.getOperand(2);
	    bool isKill2, isUndef2;
	    unsigned SrcReg2;
	    MachineOperand ImplicitOp2 = MachineOperand::CreateReg(0, false);
	    if (!classifyLEAReg(MI, Src2, Opc, /*AllowSP=*/ false,
	                        SrcReg2, isKill2, isUndef2, ImplicitOp2, LV))
	      return nullptr;

	    MachineInstrBuilder MIB =
	        BuildMI(MF, MI.getDebugLoc(), get(Opc)).addOperand(Dest);
	    if (ImplicitOp.getReg() != 0)
	      MIB.addOperand(ImplicitOp);
	    if (ImplicitOp2.getReg() != 0)
	      MIB.addOperand(ImplicitOp2);

	    NewMI = addRegReg(MIB, SrcReg, isKill, SrcReg2, isKill2);

	    // Preserve undefness of the operands.
	    NewMI->getOperand(1).setIsUndef(isUndef);
	    NewMI->getOperand(3).setIsUndef(isUndef2);

	    if (LV && Src2.isKill())
	      LV->replaceKillInstruction(SrcReg2, MI, *NewMI);
	    break;
	  }
	  case X86::ADD16rr:
	  case X86::ADD16rr_DB: {
	    if (DisableLEA16)
	      return is64Bit ? convertToThreeAddressWithLEA(MIOpc, MFI, MI, LV)
	                     : nullptr;
	    assert(MI.getNumOperands() >= 3 && "Unknown add instruction!");
	    unsigned Src2 = MI.getOperand(2).getReg();
	    bool isKill2 = MI.getOperand(2).isKill();
	    NewMI = addRegReg(
	        BuildMI(MF, MI.getDebugLoc(), get(X86::LEA16r)).addOperand(Dest),
	        Src.getReg(), Src.isKill(), Src2, isKill2);

	    // Preserve undefness of the operands.
	    bool isUndef = MI.getOperand(1).isUndef();
	    bool isUndef2 = MI.getOperand(2).isUndef();
	    NewMI->getOperand(1).setIsUndef(isUndef);
	    NewMI->getOperand(3).setIsUndef(isUndef2);

	    if (LV && isKill2)
	      LV->replaceKillInstruction(Src2, MI, *NewMI);
	    break;
	  }
	  case X86::ADD64ri32:
	  case X86::ADD64ri8:
	  case X86::ADD64ri32_DB:
	  case X86::ADD64ri8_DB:
	    assert(MI.getNumOperands() >= 3 && "Unknown add instruction!");
	    NewMI = addOffset(BuildMI(MF, MI.getDebugLoc(), get(X86::LEA64r))
	                          .addOperand(Dest)
	                          .addOperand(Src),
	                      MI.getOperand(2));
	    break;
	  case X86::ADD32ri:
	  case X86::ADD32ri8:
	  case X86::ADD32ri_DB:
	  case X86::ADD32ri8_DB: {
	    assert(MI.getNumOperands() >= 3 && "Unknown add instruction!");
	    unsigned Opc = is64Bit ? X86::LEA64_32r : X86::LEA32r;

	    bool isKill, isUndef;
	    unsigned SrcReg;
	    MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
	    if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ true,
	                        SrcReg, isKill, isUndef, ImplicitOp, LV))
	      return nullptr;

	    MachineInstrBuilder MIB = BuildMI(MF, MI.getDebugLoc(), get(Opc))
	                                  .addOperand(Dest)
	                                  .addReg(SrcReg, getUndefRegState(isUndef) |
	                                                      getKillRegState(isKill));
	    if (ImplicitOp.getReg() != 0)
	      MIB.addOperand(ImplicitOp);

	    NewMI = addOffset(MIB, MI.getOperand(2));
	    break;
	  }
	  case X86::ADD16ri:
	  case X86::ADD16ri8:
	  case X86::ADD16ri_DB:
	  case X86::ADD16ri8_DB:
	    if (DisableLEA16)
	      return is64Bit ? convertToThreeAddressWithLEA(MIOpc, MFI, MI, LV)
	                     : nullptr;
	    assert(MI.getNumOperands() >= 3 && "Unknown add instruction!");
	    NewMI = addOffset(BuildMI(MF, MI.getDebugLoc(), get(X86::LEA16r))
	                          .addOperand(Dest)
	                          .addOperand(Src),
	                      MI.getOperand(2));
	    break;
	  }

	  if (!NewMI) return nullptr;

	  if (LV) { // Update live variables
	    if (Src.isKill())
	      LV->replaceKillInstruction(Src.getReg(), MI, *NewMI);
	    if (Dest.isDead())
	      LV->replaceKillInstruction(Dest.getReg(), MI, *NewMI);
	  }

	  MFI->insert(MI.getIterator(), NewMI); // Insert the new inst
	  return NewMI;
	}

	/// Classify a requested commute of two source operands of a three-source
	/// instruction, taking any k-mask operand into account. This also prevents
	/// commuting a passthru operand.
	///
	/// \returns
	///   0  - the first and second source operands are being commuted,
	///   1  - the first and third source operands are being commuted,
	///   2  - the second and third source operands are being commuted,
	///   -1 - the commute isn't possible.
	static int getThreeSrcCommuteCase(uint64_t TSFlags, unsigned SrcOpIdx1,
	                                  unsigned SrcOpIdx2) {
	  // Normalize so that SrcOpIdx1 <= SrcOpIdx2; the arguments are by-value
	  // copies, so the caller's indices are not affected.
	  if (SrcOpIdx2 < SrcOpIdx1)
	    std::swap(SrcOpIdx1, SrcOpIdx2);

	  // Operand indices of the three sources when no k-mask is present.
	  const unsigned FirstSrc = 1;
	  unsigned SecondSrc = 2;
	  unsigned ThirdSrc = 3;

	  if (X86II::isKMasked(TSFlags)) {
	    // Index 2 holds the k-mask operand, which cannot be commuted.
	    if (SrcOpIdx1 == 2)
	      return -1;

	    // Zero-masking leaves the first vector operand free to commute.
	    // Merge-masking is handled conservatively: elements of the first vector
	    // operand whose k-mask bit is 0 are copied to the result, so that
	    // operand is kept in place.
	    // TODO/FIXME: The commute still may be legal if it is known that the
	    // k-mask operand is set to either all ones or all zeroes.
	    // It is also Ok to commute the 1st operand if all users of MI use only
	    // the elements enabled by the k-mask operand. For example,
	    //   v4 = VFMADD213PSZrk v1, k, v2, v3; // v1[i] = k[i] ? v2[i]*v1[i]+v3[i]
	    //                                      //                : v1[i];
	    //   VMOVAPSZmrk <mem_addr>, k, v4;     // this is the ONLY user of v4 ->
	    //                                      // Ok, to commute v1 in FMADD213PSZrk.
	    if (SrcOpIdx1 == FirstSrc && X86II::isKMergeMasked(TSFlags))
	      return -1;

	    // The mask sits between the first and second sources, so the remaining
	    // sources are one slot further along.
	    ++SecondSrc;
	    ++ThirdSrc;
	  }

	  if (SrcOpIdx1 == FirstSrc) {
	    if (SrcOpIdx2 == SecondSrc)
	      return 0;
	    if (SrcOpIdx2 == ThirdSrc)
	      return 1;
	    return -1;
	  }

	  if (SrcOpIdx1 == SecondSrc && SrcOpIdx2 == ThirdSrc)
	    return 2;

	  return -1;
	}

	/// Compute the FMA3 opcode that keeps the semantics of \p MI unchanged once
	/// the operands at \p SrcOpIdx1 and \p SrcOpIdx2 have been swapped.
	///
	/// \p FMA3Group supplies the 132/213/231 register and memory forms that the
	/// opcode of \p MI is looked up in.
	///
	/// \returns the opcode to use after the commute, or 0 if the requested
	/// commute cannot be done.
	unsigned X86InstrInfo::getFMA3OpcodeToCommuteOperands(
	    const MachineInstr &MI, unsigned SrcOpIdx1, unsigned SrcOpIdx2,
	    const X86InstrFMA3Group &FMA3Group) const {

	  unsigned Opc = MI.getOpcode();

	  // Put the lowest index to SrcOpIdx1 to simplify the checks below.
	  if (SrcOpIdx1 > SrcOpIdx2)
	    std::swap(SrcOpIdx1, SrcOpIdx2);

	  // TODO: Commuting the 1st operand of FMA*_Int requires some additional
	  // analysis. The commute optimization is legal only if all users of FMA*_Int
	  // use only the lowest element of the FMA*_Int instruction. Such analysis are
	  // not implemented yet. So, just return 0 in that case.
	  // When such analysis are available this place will be the right place for
	  // calling it.
	  if (FMA3Group.isIntrinsic() && SrcOpIdx1 == 1)
	    return 0;

	  // Determine which case this commute is or if it can't be done.
	  int Case = getThreeSrcCommuteCase(MI.getDesc().TSFlags, SrcOpIdx1, SrcOpIdx2);
	  if (Case < 0)
	    return 0;

	  // Define the FMA forms mapping array that helps to map input FMA form
	  // to output FMA form to preserve the operation semantics after
	  // commuting the operands.
	  const unsigned Form132Index = 0;
	  const unsigned Form213Index = 1;
	  const unsigned Form231Index = 2;
	  static const unsigned FormMapping[][3] = {
	    // 0: SrcOpIdx1 == 1 && SrcOpIdx2 == 2;
	    // FMA132 A, C, b; ==> FMA231 C, A, b;
	    // FMA213 B, A, c; ==> FMA213 A, B, c;
	    // FMA231 C, A, b; ==> FMA132 A, C, b;
	    { Form231Index, Form213Index, Form132Index },
	    // 1: SrcOpIdx1 == 1 && SrcOpIdx2 == 3;
	    // FMA132 A, c, B; ==> FMA132 B, c, A;
	    // FMA213 B, a, C; ==> FMA231 C, a, B;
	    // FMA231 C, a, B; ==> FMA213 B, a, C;
	    { Form132Index, Form231Index, Form213Index },
	    // 2: SrcOpIdx1 == 2 && SrcOpIdx2 == 3;
	    // FMA132 a, C, B; ==> FMA213 a, B, C;
	    // FMA213 b, A, C; ==> FMA132 b, C, A;
	    // FMA231 c, A, B; ==> FMA231 c, B, A;
	    { Form213Index, Form132Index, Form231Index }
	  };

	  // Collect the three forms of the same kind (register or memory) as Opc.
	  unsigned FMAForms[3];
	  if (FMA3Group.isRegOpcodeFromGroup(Opc)) {
	    FMAForms[0] = FMA3Group.getReg132Opcode();
	    FMAForms[1] = FMA3Group.getReg213Opcode();
	    FMAForms[2] = FMA3Group.getReg231Opcode();
	  } else {
	    FMAForms[0] = FMA3Group.getMem132Opcode();
	    FMAForms[1] = FMA3Group.getMem213Opcode();
	    FMAForms[2] = FMA3Group.getMem231Opcode();
	  }

	  // Find which of the three forms Opc currently is.
	  unsigned FormIndex;
	  for (FormIndex = 0; FormIndex < 3; FormIndex++)
	    if (Opc == FMAForms[FormIndex])
	      break;

	  // If Opc is not one of the group's forms, FormIndex is 3 here and would
	  // index past the end of both tables below. Report "cannot commute"
	  // instead of reading out of bounds.
	  assert(FormIndex < 3 && "Opcode is not a member of the given FMA3 group!");
	  if (FormIndex >= 3)
	    return 0;

	  // Everything is ready, just adjust the FMA opcode and return it.
	  FormIndex = FormMapping[Case][FormIndex];
	  return FMAForms[FormIndex];
	}

	/// Rewrite the immediate of a VPTERNLOG instruction so that it still
	/// describes the same operation after the operands at \p SrcOpIdx1 and
	/// \p SrcOpIdx2 are commuted. The immediate is read from, and written back
	/// to, the last operand of \p MI; the register operands themselves are not
	/// touched here.
	///
	/// \returns true if the immediate was updated, false if
	/// getThreeSrcCommuteCase rejects the commute (in which case \p MI is left
	/// unmodified).
	static bool commuteVPTERNLOG(MachineInstr &MI, unsigned SrcOpIdx1,
	                             unsigned SrcOpIdx2) {
	  uint64_t TSFlags = MI.getDesc().TSFlags;

	  // Determine which case this commute is or if it can't be done.
	  int Case = getThreeSrcCommuteCase(TSFlags, SrcOpIdx1, SrcOpIdx2);
	  if (Case < 0)
	    return false;

	  // For each case we need to swap two pairs of bits in the final immediate.
	  static const uint8_t SwapMasks[3][4] = {
	    { 0x04, 0x10, 0x08, 0x20 }, // Swap bits 2/4 and 3/5.
	    { 0x02, 0x10, 0x08, 0x40 }, // Swap bits 1/4 and 3/6.
	    { 0x02, 0x04, 0x20, 0x40 }, // Swap bits 1/2 and 5/6.
	  };
	  const uint8_t *Masks = SwapMasks[Case];

	  MachineOperand &ImmOp = MI.getOperand(MI.getNumOperands() - 1);
	  uint8_t Imm = ImmOp.getImm();
	  // Clear out the bits we are swapping.
	  uint8_t NewImm = Imm & ~(Masks[0] | Masks[1] | Masks[2] | Masks[3]);
	  // If the immediate had a bit of the pair set, then set the opposite bit.
	  if (Imm & Masks[0]) NewImm |= Masks[1];
	  if (Imm & Masks[1]) NewImm |= Masks[0];
	  if (Imm & Masks[2]) NewImm |= Masks[3];
	  if (Imm & Masks[3]) NewImm |= Masks[2];
	  ImmOp.setImm(NewImm);

	  return true;
	}

	// Returns true if Opcode is one of the VPERMI2/VPERMT2 instruction forms
	// listed below, i.e. one that can be commuted by switching between the I
	// and the T variant.
	static bool isCommutableVPERMV3Instruction(unsigned Opcode) {
	// Case labels for one operand form (rr, rm, ...) of both the I and the T
	// variant at all three vector widths.
	#define VPERMV3_FORM(Suffix, Form)                                             \
	  case X86::VPERMI2##Suffix##128##Form:                                        \
	  case X86::VPERMT2##Suffix##128##Form:                                        \
	  case X86::VPERMI2##Suffix##256##Form:                                        \
	  case X86::VPERMT2##Suffix##256##Form:                                        \
	  case X86::VPERMI2##Suffix##Form:                                             \
	  case X86::VPERMT2##Suffix##Form:

	// Element types without broadcast memory forms.
	#define VPERMV3_CASES(Suffix)                                                  \
	  VPERMV3_FORM(Suffix, rr)                                                     \
	  VPERMV3_FORM(Suffix, rm)                                                     \
	  VPERMV3_FORM(Suffix, rrkz)                                                   \
	  VPERMV3_FORM(Suffix, rmkz)

	// Element types that additionally have broadcast memory forms.
	#define VPERMV3_CASES_BROADCAST(Suffix)                                        \
	  VPERMV3_CASES(Suffix)                                                        \
	  VPERMV3_FORM(Suffix, rmb)                                                    \
	  VPERMV3_FORM(Suffix, rmbkz)

	  switch (Opcode) {
	  VPERMV3_CASES(B)
	  VPERMV3_CASES(W)
	  VPERMV3_CASES_BROADCAST(D)
	  VPERMV3_CASES_BROADCAST(Q)
	  VPERMV3_CASES_BROADCAST(PS)
	  VPERMV3_CASES_BROADCAST(PD)
	    return true;
	  default:
	    return false;
	  }

	#undef VPERMV3_CASES_BROADCAST
	#undef VPERMV3_CASES
	#undef VPERMV3_FORM
	}

	// Returns the commuted opcode for VPERMI2 and VPERMT2 instructions by
	// switching from the I opcode to the T opcode and vice versa. Opcode must be
	// one of the forms listed below; anything else is unreachable.
	static unsigned getCommutedVPERMV3Opcode(unsigned Opcode) {
	// Both directions (I -> T and T -> I) for one operand form at all three
	// vector widths.
	#define VPERMV3_SWAP_FORM(Suffix, Form)                                        \
	  case X86::VPERMI2##Suffix##128##Form:                                        \
	    return X86::VPERMT2##Suffix##128##Form;                                    \
	  case X86::VPERMT2##Suffix##128##Form:                                        \
	    return X86::VPERMI2##Suffix##128##Form;                                    \
	  case X86::VPERMI2##Suffix##256##Form:                                        \
	    return X86::VPERMT2##Suffix##256##Form;                                    \
	  case X86::VPERMT2##Suffix##256##Form:                                        \
	    return X86::VPERMI2##Suffix##256##Form;                                    \
	  case X86::VPERMI2##Suffix##Form:                                             \
	    return X86::VPERMT2##Suffix##Form;                                         \
	  case X86::VPERMT2##Suffix##Form:                                             \
	    return X86::VPERMI2##Suffix##Form;

	// Element types without broadcast memory forms.
	#define VPERMV3_SWAP(Suffix)                                                   \
	  VPERMV3_SWAP_FORM(Suffix, rr)                                                \
	  VPERMV3_SWAP_FORM(Suffix, rrkz)                                              \
	  VPERMV3_SWAP_FORM(Suffix, rm)                                                \
	  VPERMV3_SWAP_FORM(Suffix, rmkz)

	// Element types that additionally have broadcast memory forms.
	#define VPERMV3_SWAP_BROADCAST(Suffix)                                         \
	  VPERMV3_SWAP(Suffix)                                                         \
	  VPERMV3_SWAP_FORM(Suffix, rmb)                                               \
	  VPERMV3_SWAP_FORM(Suffix, rmbkz)

	  switch (Opcode) {
	  VPERMV3_SWAP(B)
	  VPERMV3_SWAP(W)
	  VPERMV3_SWAP_BROADCAST(D)
	  VPERMV3_SWAP_BROADCAST(Q)
	  VPERMV3_SWAP_BROADCAST(PS)
	  VPERMV3_SWAP_BROADCAST(PD)
	  }

	  llvm_unreachable("Unreachable!");

	#undef VPERMV3_SWAP_BROADCAST
	#undef VPERMV3_SWAP
	#undef VPERMV3_SWAP_FORM
	}

	/// Commute the operands at \p OpIdx1 and \p OpIdx2 of \p MI.
	///
	/// For opcodes whose meaning depends on operand order, the opcode and/or
	/// immediate is adjusted first so the commuted instruction computes the same
	/// result; the operand swap itself is then delegated to
	/// TargetInstrInfo::commuteInstructionImpl.
	///
	/// \param NewMI if true, the adjustments are made on a clone of \p MI
	///        (created via CloneMachineInstr) instead of on \p MI itself.
	/// \returns the commuted instruction as produced by the generic
	///          implementation, or nullptr where a case below rejects the
	///          commute.
	MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
	                                                   unsigned OpIdx1,
	                                                   unsigned OpIdx2) const {
	  // Returns the instruction to modify: a fresh clone when the caller asked
	  // for a new instruction, otherwise MI itself. Callers below then pass
	  // NewMI=false to the generic implementation so it does not clone again.
	  auto cloneIfNew = [NewMI](MachineInstr &MI) -> MachineInstr & {
	    if (NewMI)
	      return *MI.getParent()->getParent()->CloneMachineInstr(&MI);
	    return MI;
	  };

	  switch (MI.getOpcode()) {
	  case X86::SHRD16rri8: // A = SHRD16rri8 B, C, I -> A = SHLD16rri8 C, B, (16-I)
	  case X86::SHLD16rri8: // A = SHLD16rri8 B, C, I -> A = SHRD16rri8 C, B, (16-I)
	  case X86::SHRD32rri8: // A = SHRD32rri8 B, C, I -> A = SHLD32rri8 C, B, (32-I)
	  case X86::SHLD32rri8: // A = SHLD32rri8 B, C, I -> A = SHRD32rri8 C, B, (32-I)
	  case X86::SHRD64rri8: // A = SHRD64rri8 B, C, I -> A = SHLD64rri8 C, B, (64-I)
	  case X86::SHLD64rri8:{// A = SHLD64rri8 B, C, I -> A = SHRD64rri8 C, B, (64-I)
	    unsigned Opc;
	    unsigned Size;
	    switch (MI.getOpcode()) {
	    default: llvm_unreachable("Unreachable!");
	    case X86::SHRD16rri8: Size = 16; Opc = X86::SHLD16rri8; break;
	    case X86::SHLD16rri8: Size = 16; Opc = X86::SHRD16rri8; break;
	    case X86::SHRD32rri8: Size = 32; Opc = X86::SHLD32rri8; break;
	    case X86::SHLD32rri8: Size = 32; Opc = X86::SHRD32rri8; break;
	    case X86::SHRD64rri8: Size = 64; Opc = X86::SHLD64rri8; break;
	    case X86::SHLD64rri8: Size = 64; Opc = X86::SHRD64rri8; break;
	    }
	    unsigned Amt = MI.getOperand(3).getImm();
	    auto &WorkingMI = cloneIfNew(MI);
	    WorkingMI.setDesc(get(Opc));
	    WorkingMI.getOperand(3).setImm(Size - Amt);
	    return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
	                                                   OpIdx1, OpIdx2);
	  }
	  case X86::BLENDPDrri:
	  case X86::BLENDPSrri:
	  case X86::PBLENDWrri:
	  case X86::VBLENDPDrri:
	  case X86::VBLENDPSrri:
	  case X86::VBLENDPDYrri:
	  case X86::VBLENDPSYrri:
	  case X86::VPBLENDDrri:
	  case X86::VPBLENDWrri:
	  case X86::VPBLENDDYrri:
	  case X86::VPBLENDWYrri:{
	    unsigned Mask;
	    switch (MI.getOpcode()) {
	    default: llvm_unreachable("Unreachable!");
	    case X86::BLENDPDrri: Mask = 0x03; break;
	    case X86::BLENDPSrri: Mask = 0x0F; break;
	    case X86::PBLENDWrri: Mask = 0xFF; break;
	    case X86::VBLENDPDrri: Mask = 0x03; break;
	    case X86::VBLENDPSrri: Mask = 0x0F; break;
	    case X86::VBLENDPDYrri: Mask = 0x0F; break;
	    case X86::VBLENDPSYrri: Mask = 0xFF; break;
	    case X86::VPBLENDDrri: Mask = 0x0F; break;
	    case X86::VPBLENDWrri: Mask = 0xFF; break;
	    case X86::VPBLENDDYrri: Mask = 0xFF; break;
	    case X86::VPBLENDWYrri: Mask = 0xFF; break;
	    }
	    // Only the least significant bits of Imm are used.
	    unsigned Imm = MI.getOperand(3).getImm() & Mask;
	    auto &WorkingMI = cloneIfNew(MI);
	    // Invert the used selector bits.
	    WorkingMI.getOperand(3).setImm(Mask ^ Imm);
	    return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
	                                                   OpIdx1, OpIdx2);
	  }
	  case X86::MOVSDrr:
	  case X86::MOVSSrr:
	  case X86::VMOVSDrr:
	  case X86::VMOVSSrr:{
	    // On SSE41 or later we can commute a MOVSS/MOVSD to a BLENDPS/BLENDPD.
	    if (!Subtarget.hasSSE41())
	      return nullptr;

	    unsigned Mask, Opc;
	    switch (MI.getOpcode()) {
	    default: llvm_unreachable("Unreachable!");
	    case X86::MOVSDrr: Opc = X86::BLENDPDrri; Mask = 0x02; break;
	    case X86::MOVSSrr: Opc = X86::BLENDPSrri; Mask = 0x0E; break;
	    case X86::VMOVSDrr: Opc = X86::VBLENDPDrri; Mask = 0x02; break;
	    case X86::VMOVSSrr: Opc = X86::VBLENDPSrri; Mask = 0x0E; break;
	    }

	    // MOVSD/MOVSS's 2nd operand is a FR64/FR32 reg class - we need to copy
	    // this over to a VR128 class like the 1st operand to use a BLENDPD/BLENDPS.
	    auto &MRI = MI.getParent()->getParent()->getRegInfo();
	    auto VR128RC = MRI.getRegClass(MI.getOperand(1).getReg());
	    unsigned VR128 = MRI.createVirtualRegister(VR128RC);
	    BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), get(TargetOpcode::COPY),
	            VR128)
	        .addReg(MI.getOperand(2).getReg());

	    auto &WorkingMI = cloneIfNew(MI);
	    WorkingMI.setDesc(get(Opc));
	    WorkingMI.getOperand(2).setReg(VR128);
	    WorkingMI.addOperand(MachineOperand::CreateImm(Mask));
	    return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
	                                                   OpIdx1, OpIdx2);
	  }
	  case X86::PCLMULQDQrr:
	  case X86::VPCLMULQDQrr:{
	    // SRC1 64bits = Imm[0] ? SRC1[127:64] : SRC1[63:0]
	    // SRC2 64bits = Imm[4] ? SRC2[127:64] : SRC2[63:0]
	    unsigned Imm = MI.getOperand(3).getImm();
	    unsigned Src1Hi = Imm & 0x01;
	    unsigned Src2Hi = Imm & 0x10;
	    auto &WorkingMI = cloneIfNew(MI);
	    // Exchange the two selector bits (bit 0 <-> bit 4).
	    WorkingMI.getOperand(3).setImm((Src1Hi << 4) | (Src2Hi >> 4));
	    return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
	                                                   OpIdx1, OpIdx2);
	  }
	  case X86::CMPSDrr:
	  case X86::CMPSSrr:
	  case X86::CMPPDrri:
	  case X86::CMPPSrri:
	  case X86::VCMPSDrr:
	  case X86::VCMPSSrr:
	  case X86::VCMPPDrri:
	  case X86::VCMPPSrri:
	  case X86::VCMPPDYrri:
	  case X86::VCMPPSYrri:
	  case X86::VCMPSDZrr:
	  case X86::VCMPSSZrr:
	  case X86::VCMPPDZrri:
	  case X86::VCMPPSZrri:
	  case X86::VCMPPDZ128rri:
	  case X86::VCMPPSZ128rri:
	  case X86::VCMPPDZ256rri:
	  case X86::VCMPPSZ256rri: {
	    // Float comparison can be safely commuted for
	    // Ordered/Unordered/Equal/NotEqual tests
	    unsigned Imm = MI.getOperand(3).getImm() & 0x7;
	    switch (Imm) {
	    case 0x00: // EQUAL
	    case 0x03: // UNORDERED
	    case 0x04: // NOT EQUAL
	    case 0x07: // ORDERED
	      return TargetInstrInfo::commuteInstructionImpl(MI, NewMI, OpIdx1, OpIdx2);
	    default:
	      return nullptr;
	    }
	  }
	  case X86::VPCMPBZ128rri: case X86::VPCMPUBZ128rri:
	  case X86::VPCMPBZ256rri: case X86::VPCMPUBZ256rri:
	  case X86::VPCMPBZrri: case X86::VPCMPUBZrri:
	  case X86::VPCMPDZ128rri: case X86::VPCMPUDZ128rri:
	  case X86::VPCMPDZ256rri: case X86::VPCMPUDZ256rri:
	  case X86::VPCMPDZrri: case X86::VPCMPUDZrri:
	  case X86::VPCMPQZ128rri: case X86::VPCMPUQZ128rri:
	  case X86::VPCMPQZ256rri: case X86::VPCMPUQZ256rri:
	  case X86::VPCMPQZrri: case X86::VPCMPUQZrri:
	  case X86::VPCMPWZ128rri: case X86::VPCMPUWZ128rri:
	  case X86::VPCMPWZ256rri: case X86::VPCMPUWZ256rri:
	  case X86::VPCMPWZrri: case X86::VPCMPUWZrri: {
	    // Flip comparison mode immediate (if necessary).
	    unsigned Imm = MI.getOperand(3).getImm() & 0x7;
	    switch (Imm) {
	    default: llvm_unreachable("Unreachable!");
	    case 0x01: Imm = 0x06; break; // LT -> NLE
	    case 0x02: Imm = 0x05; break; // LE -> NLT
	    case 0x05: Imm = 0x02; break; // NLT -> LE
	    case 0x06: Imm = 0x01; break; // NLE -> LT
	    case 0x00: // EQ
	    case 0x03: // FALSE
	    case 0x04: // NE
	    case 0x07: // TRUE
	      break;
	    }
	    auto &WorkingMI = cloneIfNew(MI);
	    WorkingMI.getOperand(3).setImm(Imm);
	    return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
	                                                   OpIdx1, OpIdx2);
	  }
	  case X86::VPCOMBri: case X86::VPCOMUBri:
	  case X86::VPCOMDri: case X86::VPCOMUDri:
	  case X86::VPCOMQri: case X86::VPCOMUQri:
	  case X86::VPCOMWri: case X86::VPCOMUWri: {
	    // Flip comparison mode immediate (if necessary).
	    unsigned Imm = MI.getOperand(3).getImm() & 0x7;
	    switch (Imm) {
	    default: llvm_unreachable("Unreachable!");
	    case 0x00: Imm = 0x02; break; // LT -> GT
	    case 0x01: Imm = 0x03; break; // LE -> GE
	    case 0x02: Imm = 0x00; break; // GT -> LT
	    case 0x03: Imm = 0x01; break; // GE -> LE
	    case 0x04: // EQ
	    case 0x05: // NE
	    case 0x06: // FALSE
	    case 0x07: // TRUE
	      break;
	    }
	    auto &WorkingMI = cloneIfNew(MI);
	    WorkingMI.getOperand(3).setImm(Imm);
	    return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
	                                                   OpIdx1, OpIdx2);
	  }
	  case X86::VPERM2F128rr:
	  case X86::VPERM2I128rr: {
	    // Flip permute source immediate.
	    // Imm & 0x02: lo = if set, select Op1.lo/hi else Op0.lo/hi.
	    // Imm & 0x20: hi = if set, select Op1.lo/hi else Op0.lo/hi.
	    unsigned Imm = MI.getOperand(3).getImm() & 0xFF;
	    auto &WorkingMI = cloneIfNew(MI);
	    WorkingMI.getOperand(3).setImm(Imm ^ 0x22);
	    return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
	                                                   OpIdx1, OpIdx2);
	  }
	  case X86::MOVHLPSrr:
	  case X86::UNPCKHPDrr: {
	    if (!Subtarget.hasSSE2())
	      return nullptr;

	    // The two opcodes are swapped for each other when commuting.
	    unsigned Opc = MI.getOpcode();
	    switch (Opc) {
	    default: llvm_unreachable("Unreachable!");
	    case X86::MOVHLPSrr: Opc = X86::UNPCKHPDrr; break;
	    case X86::UNPCKHPDrr: Opc = X86::MOVHLPSrr; break;
	    }
	    auto &WorkingMI = cloneIfNew(MI);
	    WorkingMI.setDesc(get(Opc));
	    return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
	                                                   OpIdx1, OpIdx2);
	  }
	  case X86::CMOVB16rr: case X86::CMOVB32rr: case X86::CMOVB64rr:
	  case X86::CMOVAE16rr: case X86::CMOVAE32rr: case X86::CMOVAE64rr:
	  case X86::CMOVE16rr: case X86::CMOVE32rr: case X86::CMOVE64rr:
	  case X86::CMOVNE16rr: case X86::CMOVNE32rr: case X86::CMOVNE64rr:
	  case X86::CMOVBE16rr: case X86::CMOVBE32rr: case X86::CMOVBE64rr:
	  case X86::CMOVA16rr: case X86::CMOVA32rr: case X86::CMOVA64rr:
	  case X86::CMOVL16rr: case X86::CMOVL32rr: case X86::CMOVL64rr:
	  case X86::CMOVGE16rr: case X86::CMOVGE32rr: case X86::CMOVGE64rr:
	  case X86::CMOVLE16rr: case X86::CMOVLE32rr: case X86::CMOVLE64rr:
	  case X86::CMOVG16rr: case X86::CMOVG32rr: case X86::CMOVG64rr:
	  case X86::CMOVS16rr: case X86::CMOVS32rr: case X86::CMOVS64rr:
	  case X86::CMOVNS16rr: case X86::CMOVNS32rr: case X86::CMOVNS64rr:
	  case X86::CMOVP16rr: case X86::CMOVP32rr: case X86::CMOVP64rr:
	  case X86::CMOVNP16rr: case X86::CMOVNP32rr: case X86::CMOVNP64rr:
	  case X86::CMOVO16rr: case X86::CMOVO32rr: case X86::CMOVO64rr:
	  case X86::CMOVNO16rr: case X86::CMOVNO32rr: case X86::CMOVNO64rr: {
	    // Swapping the two CMOV sources requires the opposite condition.
	    unsigned Opc;
	    switch (MI.getOpcode()) {
	    default: llvm_unreachable("Unreachable!");
	    case X86::CMOVB16rr: Opc = X86::CMOVAE16rr; break;
	    case X86::CMOVB32rr: Opc = X86::CMOVAE32rr; break;
	    case X86::CMOVB64rr: Opc = X86::CMOVAE64rr; break;
	    case X86::CMOVAE16rr: Opc = X86::CMOVB16rr; break;
	    case X86::CMOVAE32rr: Opc = X86::CMOVB32rr; break;
	    case X86::CMOVAE64rr: Opc = X86::CMOVB64rr; break;
	    case X86::CMOVE16rr: Opc = X86::CMOVNE16rr; break;
	    case X86::CMOVE32rr: Opc = X86::CMOVNE32rr; break;
	    case X86::CMOVE64rr: Opc = X86::CMOVNE64rr; break;
	    case X86::CMOVNE16rr: Opc = X86::CMOVE16rr; break;
	    case X86::CMOVNE32rr: Opc = X86::CMOVE32rr; break;
	    case X86::CMOVNE64rr: Opc = X86::CMOVE64rr; break;
	    case X86::CMOVBE16rr: Opc = X86::CMOVA16rr; break;
	    case X86::CMOVBE32rr: Opc = X86::CMOVA32rr; break;
	    case X86::CMOVBE64rr: Opc = X86::CMOVA64rr; break;
	    case X86::CMOVA16rr: Opc = X86::CMOVBE16rr; break;
	    case X86::CMOVA32rr: Opc = X86::CMOVBE32rr; break;
	    case X86::CMOVA64rr: Opc = X86::CMOVBE64rr; break;
	    case X86::CMOVL16rr: Opc = X86::CMOVGE16rr; break;
	    case X86::CMOVL32rr: Opc = X86::CMOVGE32rr; break;
	    case X86::CMOVL64rr: Opc = X86::CMOVGE64rr; break;
	    case X86::CMOVGE16rr: Opc = X86::CMOVL16rr; break;
	    case X86::CMOVGE32rr: Opc = X86::CMOVL32rr; break;
	    case X86::CMOVGE64rr: Opc = X86::CMOVL64rr; break;
	    case X86::CMOVLE16rr: Opc = X86::CMOVG16rr; break;
	    case X86::CMOVLE32rr: Opc = X86::CMOVG32rr; break;
	    case X86::CMOVLE64rr: Opc = X86::CMOVG64rr; break;
	    case X86::CMOVG16rr: Opc = X86::CMOVLE16rr; break;
	    case X86::CMOVG32rr: Opc = X86::CMOVLE32rr; break;
	    case X86::CMOVG64rr: Opc = X86::CMOVLE64rr; break;
	    case X86::CMOVS16rr: Opc = X86::CMOVNS16rr; break;
	    case X86::CMOVS32rr: Opc = X86::CMOVNS32rr; break;
	    case X86::CMOVS64rr: Opc = X86::CMOVNS64rr; break;
	    case X86::CMOVNS16rr: Opc = X86::CMOVS16rr; break;
	    case X86::CMOVNS32rr: Opc = X86::CMOVS32rr; break;
	    case X86::CMOVNS64rr: Opc = X86::CMOVS64rr; break;
	    case X86::CMOVP16rr: Opc = X86::CMOVNP16rr; break;
	    case X86::CMOVP32rr: Opc = X86::CMOVNP32rr; break;
	    case X86::CMOVP64rr: Opc = X86::CMOVNP64rr; break;
	    case X86::CMOVNP16rr: Opc = X86::CMOVP16rr; break;
	    case X86::CMOVNP32rr: Opc = X86::CMOVP32rr; break;
	    case X86::CMOVNP64rr: Opc = X86::CMOVP64rr; break;
	    case X86::CMOVO16rr: Opc = X86::CMOVNO16rr; break;
	    case X86::CMOVO32rr: Opc = X86::CMOVNO32rr; break;
	    case X86::CMOVO64rr: Opc = X86::CMOVNO64rr; break;
	    case X86::CMOVNO16rr: Opc = X86::CMOVO16rr; break;
	    case X86::CMOVNO32rr: Opc = X86::CMOVO32rr; break;
	    case X86::CMOVNO64rr: Opc = X86::CMOVO64rr; break;
	    }
	    auto &WorkingMI = cloneIfNew(MI);
	    WorkingMI.setDesc(get(Opc));
	    return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
	                                                   OpIdx1, OpIdx2);
	  }
	  case X86::VPTERNLOGDZrri: case X86::VPTERNLOGDZrmi:
	  case X86::VPTERNLOGDZ128rri: case X86::VPTERNLOGDZ128rmi:
	  case X86::VPTERNLOGDZ256rri: case X86::VPTERNLOGDZ256rmi:
	  case X86::VPTERNLOGQZrri: case X86::VPTERNLOGQZrmi:
	  case X86::VPTERNLOGQZ128rri: case X86::VPTERNLOGQZ128rmi:
	  case X86::VPTERNLOGQZ256rri: case X86::VPTERNLOGQZ256rmi:
	  case X86::VPTERNLOGDZrrik: case X86::VPTERNLOGDZrmik:
	  case X86::VPTERNLOGDZ128rrik: case X86::VPTERNLOGDZ128rmik:
	  case X86::VPTERNLOGDZ256rrik: case X86::VPTERNLOGDZ256rmik:
	  case X86::VPTERNLOGQZrrik: case X86::VPTERNLOGQZrmik:
	  case X86::VPTERNLOGQZ128rrik: case X86::VPTERNLOGQZ128rmik:
	  case X86::VPTERNLOGQZ256rrik: case X86::VPTERNLOGQZ256rmik:
	  case X86::VPTERNLOGDZrrikz: case X86::VPTERNLOGDZrmikz:
	  case X86::VPTERNLOGDZ128rrikz: case X86::VPTERNLOGDZ128rmikz:
	  case X86::VPTERNLOGDZ256rrikz: case X86::VPTERNLOGDZ256rmikz:
	  case X86::VPTERNLOGQZrrikz: case X86::VPTERNLOGQZrmikz:
	  case X86::VPTERNLOGQZ128rrikz: case X86::VPTERNLOGQZ128rmikz:
	  case X86::VPTERNLOGQZ256rrikz: case X86::VPTERNLOGQZ256rmikz: {
	    auto &WorkingMI = cloneIfNew(MI);
	    if (!commuteVPTERNLOG(WorkingMI, OpIdx1, OpIdx2))
	      return nullptr;
	    return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
	                                                   OpIdx1, OpIdx2);
	  }
	  default: {
	    // VPERMI2/VPERMT2: switch between the I and the T opcode.
	    if (isCommutableVPERMV3Instruction(MI.getOpcode())) {
	      unsigned Opc = getCommutedVPERMV3Opcode(MI.getOpcode());
	      auto &WorkingMI = cloneIfNew(MI);
	      WorkingMI.setDesc(get(Opc));
	      return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
	                                                     OpIdx1, OpIdx2);
	    }

	    // FMA3: pick the 132/213/231 form matching the swapped operands.
	    const X86InstrFMA3Group *FMA3Group =
	        X86InstrFMA3Info::getFMA3Group(MI.getOpcode());
	    if (FMA3Group) {
	      unsigned Opc =
	          getFMA3OpcodeToCommuteOperands(MI, OpIdx1, OpIdx2, *FMA3Group);
	      if (Opc == 0)
	        return nullptr;
	      auto &WorkingMI = cloneIfNew(MI);
	      WorkingMI.setDesc(get(Opc));
	      return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
	                                                     OpIdx1, OpIdx2);
	    }

	    // Nothing target-specific to adjust.
	    return TargetInstrInfo::commuteInstructionImpl(MI, NewMI, OpIdx1, OpIdx2);
	  }
	  }
	}

	/// Pick a commutable operand pair for an FMA3 instruction. The generic
	/// three-source search runs first; the pair it produces is accepted only
	/// when getFMA3OpcodeToCommuteOperands reports a non-zero opcode for it.
	bool X86InstrInfo::findFMA3CommutedOpIndices(
	    const MachineInstr &MI, unsigned &SrcOpIdx1, unsigned &SrcOpIdx2,
	    const X86InstrFMA3Group &FMA3Group) const {
	  if (findThreeSrcCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2)) {
	    // The commute is only legal if the opcode can be adjusted so that the
	    // semantics are preserved once the register operands are swapped.
	    const unsigned CommutedOpc =
	        getFMA3OpcodeToCommuteOperands(MI, SrcOpIdx1, SrcOpIdx2, FMA3Group);
	    return CommutedOpc != 0;
	  }
	  return false;
	}

	/// Choose a pair of commutable vector source operands for a three-source
	/// instruction, which may be k-masked.
	///
	/// On entry each of SrcOpIdx1 / SrcOpIdx2 is either a fixed operand index
	/// or CommuteAnyOperandIndex. Returns false when a fixed index lies
	/// outside the commutable range or names the k-mask operand, or when no
	/// partner operand holding a different register can be found. Otherwise
	/// returns true; unspecified indices are filled in through
	/// fixCommutedOpIndices.
	bool X86InstrInfo::findThreeSrcCommutedOpIndices(const MachineInstr &MI,
	                                                 unsigned &SrcOpIdx1,
	                                                 unsigned &SrcOpIdx2) const {
	  uint64_t TSFlags = MI.getDesc().TSFlags;

	  unsigned FirstCommutableVecOp = 1;
	  unsigned LastCommutableVecOp = 3;
	  unsigned KMaskOp = 0;
	  if (X86II::isKMasked(TSFlags)) {
	    // The k-mask operand has index = 2 for masked and zero-masked operations.
	    KMaskOp = 2;

	    // The operand with index = 1 is used as a source for those elements for
	    // which the corresponding bit in the k-mask is set to 0.
	    if (X86II::isKMergeMasked(TSFlags))
	      FirstCommutableVecOp = 3;

	    // The mask occupies one operand slot, so the last vector operand moves up.
	    LastCommutableVecOp++;
	  }

	  // A memory operand in the last position cannot be commuted; exclude it.
	  if (isMem(MI, LastCommutableVecOp))
	    LastCommutableVecOp--;

	  // Only operands in [FirstCommutableVecOp, LastCommutableVecOp], other than
	  // the k-mask operand, are commutable.
	  // Also, the value 'CommuteAnyOperandIndex' is valid here as it means
	  // that the operand is not specified/fixed.
	  if (SrcOpIdx1 != CommuteAnyOperandIndex &&
	      (SrcOpIdx1 < FirstCommutableVecOp || SrcOpIdx1 > LastCommutableVecOp ||
	       SrcOpIdx1 == KMaskOp))
	    return false;
	  if (SrcOpIdx2 != CommuteAnyOperandIndex &&
	      (SrcOpIdx2 < FirstCommutableVecOp || SrcOpIdx2 > LastCommutableVecOp ||
	       SrcOpIdx2 == KMaskOp))
	    return false;

	  // Look for two different register operands assumed to be commutable
	  // regardless of the FMA opcode. The FMA opcode is adjusted later.
	  if (SrcOpIdx1 == CommuteAnyOperandIndex ||
	      SrcOpIdx2 == CommuteAnyOperandIndex) {
	    unsigned CommutableOpIdx1 = SrcOpIdx1;
	    unsigned CommutableOpIdx2 = SrcOpIdx2;

	    // At least one of operands to be commuted is not specified and
	    // this method is free to choose appropriate commutable operands.
	    if (SrcOpIdx1 == SrcOpIdx2)
	      // Both of operands are not fixed. By default set one of commutable
	      // operands to the last register operand of the instruction.
	      CommutableOpIdx2 = LastCommutableVecOp;
	    else if (SrcOpIdx2 == CommuteAnyOperandIndex)
	      // Only one of operands is not fixed.
	      CommutableOpIdx2 = SrcOpIdx1;

	    // CommutableOpIdx2 is well defined now. Let's choose another commutable
	    // operand and assign its index to CommutableOpIdx1.
	    unsigned Op2Reg = MI.getOperand(CommutableOpIdx2).getReg();
	    for (CommutableOpIdx1 = LastCommutableVecOp;
	         CommutableOpIdx1 >= FirstCommutableVecOp; CommutableOpIdx1--) {
	      // Just ignore and skip the k-mask operand.
	      if (CommutableOpIdx1 == KMaskOp)
	        continue;

	      // The commuted operands must have different registers.
	      // Otherwise, the commute transformation does not change anything and
	      // is useless then.
	      if (Op2Reg != MI.getOperand(CommutableOpIdx1).getReg())
	        break;
	    }

	    // No appropriate commutable operands were found.
	    if (CommutableOpIdx1 < FirstCommutableVecOp)
	      return false;

	    // Assign the found pair of commutable indices to SrcOpIdx1 and SrcOpIdx2
	    // to return those values.
	    if (!fixCommutedOpIndices(SrcOpIdx1, SrcOpIdx2,
	                              CommutableOpIdx1, CommutableOpIdx2))
	      return false;
	  }

	  return true;
	}

	/// Determine which two operands of MI may be commuted.
	///
	/// SrcOpIdx1 / SrcOpIdx2 are in-out: the opcode-specific helpers and
	/// fixCommutedOpIndices fill them in. Returns false when the instruction
	/// descriptor is not commutable or when no legal pair exists for the
	/// opcode.
	bool X86InstrInfo::findCommutedOpIndices(MachineInstr &MI, unsigned &SrcOpIdx1,
	                                         unsigned &SrcOpIdx2) const {
	  const MCInstrDesc &Desc = MI.getDesc();
	  if (!Desc.isCommutable())
	    return false;

	  switch (MI.getOpcode()) {
	  case X86::CMPSDrr:
	  case X86::CMPSSrr:
	  case X86::CMPPDrri:
	  case X86::CMPPSrri:
	  case X86::VCMPSDrr:
	  case X86::VCMPSSrr:
	  case X86::VCMPPDrri:
	  case X86::VCMPPSrri:
	  case X86::VCMPPDYrri:
	  case X86::VCMPPSYrri:
	  case X86::VCMPSDZrr:
	  case X86::VCMPSSZrr:
	  case X86::VCMPPDZrri:
	  case X86::VCMPPSZrri:
	  case X86::VCMPPDZ128rri:
	  case X86::VCMPPSZ128rri:
	  case X86::VCMPPDZ256rri:
	  case X86::VCMPPSZ256rri: {
	    // Float comparison can be safely commuted for
	    // Ordered/Unordered/Equal/NotEqual tests
	    unsigned Imm = MI.getOperand(3).getImm() & 0x7;
	    switch (Imm) {
	    case 0x00: // EQUAL
	    case 0x03: // UNORDERED
	    case 0x04: // NOT EQUAL
	    case 0x07: // ORDERED
	      // The indices of the commutable operands are 1 and 2.
	      // Assign them to the returned operand indices here.
	      return fixCommutedOpIndices(SrcOpIdx1, SrcOpIdx2, 1, 2);
	    }
	    return false;
	  }
	  case X86::MOVSDrr:
	  case X86::MOVSSrr:
	  case X86::VMOVSDrr:
	  case X86::VMOVSSrr: {
	    // These moves are only treated as commutable on SSE4.1 subtargets.
	    if (Subtarget.hasSSE41())
	      return TargetInstrInfo::findCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2);
	    return false;
	  }
	  case X86::VPTERNLOGDZrri:      case X86::VPTERNLOGDZrmi:
	  case X86::VPTERNLOGDZ128rri:   case X86::VPTERNLOGDZ128rmi:
	  case X86::VPTERNLOGDZ256rri:   case X86::VPTERNLOGDZ256rmi:
	  case X86::VPTERNLOGQZrri:      case X86::VPTERNLOGQZrmi:
	  case X86::VPTERNLOGQZ128rri:   case X86::VPTERNLOGQZ128rmi:
	  case X86::VPTERNLOGQZ256rri:   case X86::VPTERNLOGQZ256rmi:
	  case X86::VPTERNLOGDZrrik:     case X86::VPTERNLOGDZrmik:
	  case X86::VPTERNLOGDZ128rrik:  case X86::VPTERNLOGDZ128rmik:
	  case X86::VPTERNLOGDZ256rrik:  case X86::VPTERNLOGDZ256rmik:
	  case X86::VPTERNLOGQZrrik:     case X86::VPTERNLOGQZrmik:
	  case X86::VPTERNLOGQZ128rrik:  case X86::VPTERNLOGQZ128rmik:
	  case X86::VPTERNLOGQZ256rrik:  case X86::VPTERNLOGQZ256rmik:
	  case X86::VPTERNLOGDZrrikz:    case X86::VPTERNLOGDZrmikz:
	  case X86::VPTERNLOGDZ128rrikz: case X86::VPTERNLOGDZ128rmikz:
	  case X86::VPTERNLOGDZ256rrikz: case X86::VPTERNLOGDZ256rmikz:
	  case X86::VPTERNLOGQZrrikz:    case X86::VPTERNLOGQZrmikz:
	  case X86::VPTERNLOGQZ128rrikz: case X86::VPTERNLOGQZ128rmikz:
	  case X86::VPTERNLOGQZ256rrikz: case X86::VPTERNLOGQZ256rmikz:
	    return findThreeSrcCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2);
	  default: {
	    const X86InstrFMA3Group *FMA3Group =
	        X86InstrFMA3Info::getFMA3Group(MI.getOpcode());
	    if (FMA3Group)
	      return findFMA3CommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2, *FMA3Group);

	    // Handled masked instructions since we need to skip over the mask input
	    // and the preserved input.
	    if (Desc.TSFlags & X86II::EVEX_K) {
	      // First assume that the first input is the mask operand and skip past it.
	      unsigned CommutableOpIdx1 = Desc.getNumDefs() + 1;
	      unsigned CommutableOpIdx2 = Desc.getNumDefs() + 2;
	      // Check if the first input is tied. If there isn't one then we only
	      // need to skip the mask operand which we did above.
	      if (Desc.getOperandConstraint(Desc.getNumDefs(), MCOI::TIED_TO) != -1) {
	        // If this is zero masking instruction with a tied operand, we need to
	        // move the first index back to the first input since this must
	        // be a 3 input instruction and we want the first two non-mask inputs.
	        // Otherwise this is a 2 input instruction with a preserved input and
	        // mask, so we need to move the indices to skip one more input.
	        if (Desc.TSFlags & X86II::EVEX_Z)
	          --CommutableOpIdx1;
	        else {
	          ++CommutableOpIdx1;
	          ++CommutableOpIdx2;
	        }
	      }

	      if (!fixCommutedOpIndices(SrcOpIdx1, SrcOpIdx2,
	                                CommutableOpIdx1, CommutableOpIdx2))
	        return false;

	      if (!MI.getOperand(SrcOpIdx1).isReg() ||
	          !MI.getOperand(SrcOpIdx2).isReg())
	        // No idea.
	        return false;
	      return true;
	    }

	    return TargetInstrInfo::findCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2);
	  }
	  }
	  return false;
	}

	/// Translate a conditional-jump opcode (the J*_1 forms) into its condition
	/// code. Every other opcode yields X86::COND_INVALID.
	static X86::CondCode getCondFromBranchOpc(unsigned BrOpc) {
	  switch (BrOpc) {
	  case X86::JA_1:
	    return X86::COND_A;
	  case X86::JAE_1:
	    return X86::COND_AE;
	  case X86::JB_1:
	    return X86::COND_B;
	  case X86::JBE_1:
	    return X86::COND_BE;
	  case X86::JE_1:
	    return X86::COND_E;
	  case X86::JG_1:
	    return X86::COND_G;
	  case X86::JGE_1:
	    return X86::COND_GE;
	  case X86::JL_1:
	    return X86::COND_L;
	  case X86::JLE_1:
	    return X86::COND_LE;
	  case X86::JNE_1:
	    return X86::COND_NE;
	  case X86::JNO_1:
	    return X86::COND_NO;
	  case X86::JNP_1:
	    return X86::COND_NP;
	  case X86::JNS_1:
	    return X86::COND_NS;
	  case X86::JO_1:
	    return X86::COND_O;
	  case X86::JP_1:
	    return X86::COND_P;
	  case X86::JS_1:
	    return X86::COND_S;
	  default:
	    return X86::COND_INVALID;
	  }
	}

	/// Return the condition code tested by a SETcc opcode (register or memory
	/// form), or X86::COND_INVALID for any other opcode.
	static X86::CondCode getCondFromSETOpc(unsigned Opc) {
	  switch (Opc) {
	  case X86::SETAr:
	  case X86::SETAm:
	    return X86::COND_A;
	  case X86::SETAEr:
	  case X86::SETAEm:
	    return X86::COND_AE;
	  case X86::SETBr:
	  case X86::SETBm:
	    return X86::COND_B;
	  case X86::SETBEr:
	  case X86::SETBEm:
	    return X86::COND_BE;
	  case X86::SETEr:
	  case X86::SETEm:
	    return X86::COND_E;
	  case X86::SETGr:
	  case X86::SETGm:
	    return X86::COND_G;
	  case X86::SETGEr:
	  case X86::SETGEm:
	    return X86::COND_GE;
	  case X86::SETLr:
	  case X86::SETLm:
	    return X86::COND_L;
	  case X86::SETLEr:
	  case X86::SETLEm:
	    return X86::COND_LE;
	  case X86::SETNEr:
	  case X86::SETNEm:
	    return X86::COND_NE;
	  case X86::SETNOr:
	  case X86::SETNOm:
	    return X86::COND_NO;
	  case X86::SETNPr:
	  case X86::SETNPm:
	    return X86::COND_NP;
	  case X86::SETNSr:
	  case X86::SETNSm:
	    return X86::COND_NS;
	  case X86::SETOr:
	  case X86::SETOm:
	    return X86::COND_O;
	  case X86::SETPr:
	  case X86::SETPm:
	    return X86::COND_P;
	  case X86::SETSr:
	  case X86::SETSm:
	    return X86::COND_S;
	  default:
	    return X86::COND_INVALID;
	  }
	}

	/// Return the condition code tested by a CMOVcc opcode (16/32/64-bit,
	/// register or memory source), or X86::COND_INVALID for any other opcode.
	X86::CondCode X86::getCondFromCMovOpc(unsigned Opc) {
	  switch (Opc) {
	  case X86::CMOVA16rr: case X86::CMOVA32rr: case X86::CMOVA64rr:
	  case X86::CMOVA16rm: case X86::CMOVA32rm: case X86::CMOVA64rm:
	    return X86::COND_A;

	  case X86::CMOVAE16rr: case X86::CMOVAE32rr: case X86::CMOVAE64rr:
	  case X86::CMOVAE16rm: case X86::CMOVAE32rm: case X86::CMOVAE64rm:
	    return X86::COND_AE;

	  case X86::CMOVB16rr: case X86::CMOVB32rr: case X86::CMOVB64rr:
	  case X86::CMOVB16rm: case X86::CMOVB32rm: case X86::CMOVB64rm:
	    return X86::COND_B;

	  case X86::CMOVBE16rr: case X86::CMOVBE32rr: case X86::CMOVBE64rr:
	  case X86::CMOVBE16rm: case X86::CMOVBE32rm: case X86::CMOVBE64rm:
	    return X86::COND_BE;

	  case X86::CMOVE16rr: case X86::CMOVE32rr: case X86::CMOVE64rr:
	  case X86::CMOVE16rm: case X86::CMOVE32rm: case X86::CMOVE64rm:
	    return X86::COND_E;

	  case X86::CMOVG16rr: case X86::CMOVG32rr: case X86::CMOVG64rr:
	  case X86::CMOVG16rm: case X86::CMOVG32rm: case X86::CMOVG64rm:
	    return X86::COND_G;

	  case X86::CMOVGE16rr: case X86::CMOVGE32rr: case X86::CMOVGE64rr:
	  case X86::CMOVGE16rm: case X86::CMOVGE32rm: case X86::CMOVGE64rm:
	    return X86::COND_GE;

	  case X86::CMOVL16rr: case X86::CMOVL32rr: case X86::CMOVL64rr:
	  case X86::CMOVL16rm: case X86::CMOVL32rm: case X86::CMOVL64rm:
	    return X86::COND_L;

	  case X86::CMOVLE16rr: case X86::CMOVLE32rr: case X86::CMOVLE64rr:
	  case X86::CMOVLE16rm: case X86::CMOVLE32rm: case X86::CMOVLE64rm:
	    return X86::COND_LE;

	  case X86::CMOVNE16rr: case X86::CMOVNE32rr: case X86::CMOVNE64rr:
	  case X86::CMOVNE16rm: case X86::CMOVNE32rm: case X86::CMOVNE64rm:
	    return X86::COND_NE;

	  case X86::CMOVNO16rr: case X86::CMOVNO32rr: case X86::CMOVNO64rr:
	  case X86::CMOVNO16rm: case X86::CMOVNO32rm: case X86::CMOVNO64rm:
	    return X86::COND_NO;

	  case X86::CMOVNP16rr: case X86::CMOVNP32rr: case X86::CMOVNP64rr:
	  case X86::CMOVNP16rm: case X86::CMOVNP32rm: case X86::CMOVNP64rm:
	    return X86::COND_NP;

	  case X86::CMOVNS16rr: case X86::CMOVNS32rr: case X86::CMOVNS64rr:
	  case X86::CMOVNS16rm: case X86::CMOVNS32rm: case X86::CMOVNS64rm:
	    return X86::COND_NS;

	  case X86::CMOVO16rr: case X86::CMOVO32rr: case X86::CMOVO64rr:
	  case X86::CMOVO16rm: case X86::CMOVO32rm: case X86::CMOVO64rm:
	    return X86::COND_O;

	  case X86::CMOVP16rr: case X86::CMOVP32rr: case X86::CMOVP64rr:
	  case X86::CMOVP16rm: case X86::CMOVP32rm: case X86::CMOVP64rm:
	    return X86::COND_P;

	  case X86::CMOVS16rr: case X86::CMOVS32rr: case X86::CMOVS64rr:
	  case X86::CMOVS16rm: case X86::CMOVS32rm: case X86::CMOVS64rm:
	    return X86::COND_S;

	  default:
	    return X86::COND_INVALID;
	  }
	}

	/// Return the conditional-jump opcode (J*_1 form) that tests CC. Condition
	/// codes without a single-branch encoding hit llvm_unreachable.
	unsigned X86::GetCondBranchFromCond(X86::CondCode CC) {
	  switch (CC) {
	  case X86::COND_A:
	    return X86::JA_1;
	  case X86::COND_AE:
	    return X86::JAE_1;
	  case X86::COND_B:
	    return X86::JB_1;
	  case X86::COND_BE:
	    return X86::JBE_1;
	  case X86::COND_E:
	    return X86::JE_1;
	  case X86::COND_G:
	    return X86::JG_1;
	  case X86::COND_GE:
	    return X86::JGE_1;
	  case X86::COND_L:
	    return X86::JL_1;
	  case X86::COND_LE:
	    return X86::JLE_1;
	  case X86::COND_NE:
	    return X86::JNE_1;
	  case X86::COND_NO:
	    return X86::JNO_1;
	  case X86::COND_NP:
	    return X86::JNP_1;
	  case X86::COND_NS:
	    return X86::JNS_1;
	  case X86::COND_O:
	    return X86::JO_1;
	  case X86::COND_P:
	    return X86::JP_1;
	  case X86::COND_S:
	    return X86::JS_1;
	  default:
	    llvm_unreachable("Illegal condition code!");
	  }
	}

	/// Return the logical negation of CC, e.g. COND_E becomes COND_NE. The two
	/// composite codes COND_NE_OR_P and COND_E_AND_NP are each other's inverse.
	/// Any other value hits llvm_unreachable.
	X86::CondCode X86::GetOppositeBranchCondition(X86::CondCode CC) {
	  switch (CC) {
	  // Equality.
	  case X86::COND_E:
	    return X86::COND_NE;
	  case X86::COND_NE:
	    return X86::COND_E;

	  // Signed comparisons.
	  case X86::COND_L:
	    return X86::COND_GE;
	  case X86::COND_GE:
	    return X86::COND_L;
	  case X86::COND_LE:
	    return X86::COND_G;
	  case X86::COND_G:
	    return X86::COND_LE;

	  // Unsigned comparisons.
	  case X86::COND_B:
	    return X86::COND_AE;
	  case X86::COND_AE:
	    return X86::COND_B;
	  case X86::COND_BE:
	    return X86::COND_A;
	  case X86::COND_A:
	    return X86::COND_BE;

	  // Single-flag tests.
	  case X86::COND_S:
	    return X86::COND_NS;
	  case X86::COND_NS:
	    return X86::COND_S;
	  case X86::COND_P:
	    return X86::COND_NP;
	  case X86::COND_NP:
	    return X86::COND_P;
	  case X86::COND_O:
	    return X86::COND_NO;
	  case X86::COND_NO:
	    return X86::COND_O;

	  // Composite conditions.
	  case X86::COND_NE_OR_P:
	    return X86::COND_E_AND_NP;
	  case X86::COND_E_AND_NP:
	    return X86::COND_NE_OR_P;

	  default:
	    llvm_unreachable("Illegal condition code!");
	  }
	}

	/// Given a condition evaluated on flags produced by MI(a,b), return the
	/// condition that gives the same answer on flags produced by MI(b,a).
	/// Conditions with no swapped counterpart yield X86::COND_INVALID.
	static X86::CondCode getSwappedCondition(X86::CondCode CC) {
	  switch (CC) {
	  // Equality tests are symmetric in their operands.
	  case X86::COND_E:
	  case X86::COND_NE:
	    return CC;

	  // Signed orderings flip direction.
	  case X86::COND_L:
	    return X86::COND_G;
	  case X86::COND_G:
	    return X86::COND_L;
	  case X86::COND_LE:
	    return X86::COND_GE;
	  case X86::COND_GE:
	    return X86::COND_LE;

	  // Unsigned orderings flip direction.
	  case X86::COND_B:
	    return X86::COND_A;
	  case X86::COND_A:
	    return X86::COND_B;
	  case X86::COND_BE:
	    return X86::COND_AE;
	  case X86::COND_AE:
	    return X86::COND_BE;

	  default:
	    return X86::COND_INVALID;
	  }
	}

	/// Look up the SETcc opcode for condition CC. HasMemoryOperand selects the
	/// memory-destination form instead of the register form. The table is
	/// indexed directly by CC, so only codes up to LAST_VALID_COND are accepted.
	unsigned X86::getSETFromCond(CondCode CC, bool HasMemoryOperand) {
	  // Column 0: register form, column 1: memory form.
	  static const uint16_t SetOpcodes[16][2] = {
	    { X86::SETAr,  X86::SETAm  },
	    { X86::SETAEr, X86::SETAEm },
	    { X86::SETBr,  X86::SETBm  },
	    { X86::SETBEr, X86::SETBEm },
	    { X86::SETEr,  X86::SETEm  },
	    { X86::SETGr,  X86::SETGm  },
	    { X86::SETGEr, X86::SETGEm },
	    { X86::SETLr,  X86::SETLm  },
	    { X86::SETLEr, X86::SETLEm },
	    { X86::SETNEr, X86::SETNEm },
	    { X86::SETNOr, X86::SETNOm },
	    { X86::SETNPr, X86::SETNPm },
	    { X86::SETNSr, X86::SETNSm },
	    { X86::SETOr,  X86::SETOm  },
	    { X86::SETPr,  X86::SETPm  },
	    { X86::SETSr,  X86::SETSm  }
	  };

	  assert(CC <= LAST_VALID_COND && "Can only handle standard cond codes");
	  if (HasMemoryOperand)
	    return SetOpcodes[CC][1];
	  return SetOpcodes[CC][0];
	}

	/// Look up the CMOVcc opcode for condition CC, an operand width of RegBytes
	/// (2, 4 or 8) and either a register or a memory source operand. Any other
	/// width hits llvm_unreachable.
	unsigned X86::getCMovFromCond(CondCode CC, unsigned RegBytes,
	                              bool HasMemoryOperand) {
	  // Rows 0-15 hold the register-source forms, rows 16-31 the memory-source
	  // forms, both in condition-code order. Columns are the 16/32/64-bit widths.
	  static const uint16_t CMovOpcodes[32][3] = {
	    { X86::CMOVA16rr,  X86::CMOVA32rr,  X86::CMOVA64rr  },
	    { X86::CMOVAE16rr, X86::CMOVAE32rr, X86::CMOVAE64rr },
	    { X86::CMOVB16rr,  X86::CMOVB32rr,  X86::CMOVB64rr  },
	    { X86::CMOVBE16rr, X86::CMOVBE32rr, X86::CMOVBE64rr },
	    { X86::CMOVE16rr,  X86::CMOVE32rr,  X86::CMOVE64rr  },
	    { X86::CMOVG16rr,  X86::CMOVG32rr,  X86::CMOVG64rr  },
	    { X86::CMOVGE16rr, X86::CMOVGE32rr, X86::CMOVGE64rr },
	    { X86::CMOVL16rr,  X86::CMOVL32rr,  X86::CMOVL64rr  },
	    { X86::CMOVLE16rr, X86::CMOVLE32rr, X86::CMOVLE64rr },
	    { X86::CMOVNE16rr, X86::CMOVNE32rr, X86::CMOVNE64rr },
	    { X86::CMOVNO16rr, X86::CMOVNO32rr, X86::CMOVNO64rr },
	    { X86::CMOVNP16rr, X86::CMOVNP32rr, X86::CMOVNP64rr },
	    { X86::CMOVNS16rr, X86::CMOVNS32rr, X86::CMOVNS64rr },
	    { X86::CMOVO16rr,  X86::CMOVO32rr,  X86::CMOVO64rr  },
	    { X86::CMOVP16rr,  X86::CMOVP32rr,  X86::CMOVP64rr  },
	    { X86::CMOVS16rr,  X86::CMOVS32rr,  X86::CMOVS64rr  },
	    { X86::CMOVA16rm,  X86::CMOVA32rm,  X86::CMOVA64rm  },
	    { X86::CMOVAE16rm, X86::CMOVAE32rm, X86::CMOVAE64rm },
	    { X86::CMOVB16rm,  X86::CMOVB32rm,  X86::CMOVB64rm  },
	    { X86::CMOVBE16rm, X86::CMOVBE32rm, X86::CMOVBE64rm },
	    { X86::CMOVE16rm,  X86::CMOVE32rm,  X86::CMOVE64rm  },
	    { X86::CMOVG16rm,  X86::CMOVG32rm,  X86::CMOVG64rm  },
	    { X86::CMOVGE16rm, X86::CMOVGE32rm, X86::CMOVGE64rm },
	    { X86::CMOVL16rm,  X86::CMOVL32rm,  X86::CMOVL64rm  },
	    { X86::CMOVLE16rm, X86::CMOVLE32rm, X86::CMOVLE64rm },
	    { X86::CMOVNE16rm, X86::CMOVNE32rm, X86::CMOVNE64rm },
	    { X86::CMOVNO16rm, X86::CMOVNO32rm, X86::CMOVNO64rm },
	    { X86::CMOVNP16rm, X86::CMOVNP32rm, X86::CMOVNP64rm },
	    { X86::CMOVNS16rm, X86::CMOVNS32rm, X86::CMOVNS64rm },
	    { X86::CMOVO16rm,  X86::CMOVO32rm,  X86::CMOVO64rm  },
	    { X86::CMOVP16rm,  X86::CMOVP32rm,  X86::CMOVP64rm  },
	    { X86::CMOVS16rm,  X86::CMOVS32rm,  X86::CMOVS64rm  }
	  };

	  assert(CC < 16 && "Can only handle standard cond codes");
	  unsigned Row = CC;
	  if (HasMemoryOperand)
	    Row = 16+CC;

	  unsigned Column;
	  switch (RegBytes) {
	  case 2:
	    Column = 0;
	    break;
	  case 4:
	    Column = 1;
	    break;
	  case 8:
	    Column = 2;
	    break;
	  default:
	    llvm_unreachable("Illegal register size!");
	  }
	  return CMovOpcodes[Row][Column];
	}

	/// Return true if MI is a terminator that is not predicated. The checks
	/// short-circuit in this order: a branch that is not a barrier (i.e. a
	/// conditional branch) counts, then anything that is not predicable, then
	/// anything that isPredicated() rejects.
	bool X86InstrInfo::isUnpredicatedTerminator(const MachineInstr &MI) const {
	  return MI.isTerminator() &&
	         ((MI.isBranch() && !MI.isBarrier()) || // Conditional branch.
	          !MI.isPredicable() ||
	          !isPredicated(MI));
	}

	-bool X86InstrInfo::isUnconditionalTailCall(const MachineInstr &MI) const {
	- switch (MI.getOpcode()) {
	- case X86::TCRETURNdi:
	- case X86::TCRETURNri:
	- case X86::TCRETURNmi:
	- case X86::TCRETURNdi64:
	- case X86::TCRETURNri64:
	- case X86::TCRETURNmi64:
	- return true;
	- default:
	- return false;
	- }
	-}
	-
	-bool X86InstrInfo::canMakeTailCallConditional(
	- SmallVectorImpl<MachineOperand> &BranchCond,
	- const MachineInstr &TailCall) const {
	- if (TailCall.getOpcode() != X86::TCRETURNdi &&
	- TailCall.getOpcode() != X86::TCRETURNdi64) {
	- // Only direct calls can be done with a conditional branch.
	- return false;
	- }
	-
	- if (Subtarget.isTargetWin64()) {
	- // Conditional tail calls confuse the Win64 unwinder.
	- // TODO: Allow them for "leaf" functions; PR30337.
	- return false;
	- }
	-
	- assert(BranchCond.size() == 1);
	- if (BranchCond[0].getImm() > X86::LAST_VALID_COND) {
	- // Can't make a conditional tail call with this condition.
	- return false;
	- }
	-
	- const X86MachineFunctionInfo *X86FI =
	- TailCall.getParent()->getParent()->getInfo<X86MachineFunctionInfo>();
	- if (X86FI->getTCReturnAddrDelta() != 0 \|\|
	- TailCall.getOperand(1).getImm() != 0) {
	- // A conditional tail call cannot do any stack adjustment.
	- return false;
	- }
	-
	- return true;
	-}
	-
	-void X86InstrInfo::replaceBranchWithTailCall(
	- MachineBasicBlock &MBB, SmallVectorImpl<MachineOperand> &BranchCond,
	- const MachineInstr &TailCall) const {
	- assert(canMakeTailCallConditional(BranchCond, TailCall));
	-
	- MachineBasicBlock::iterator I = MBB.end();
	- while (I != MBB.begin()) {
	- --I;
	- if (I->isDebugValue())
	- continue;
	- if (!I->isBranch())
	- assert(0 && "Can't find the branch to replace!");
	-
	- X86::CondCode CC = getCondFromBranchOpc(I->getOpcode());
	- assert(BranchCond.size() == 1);
	- if (CC != BranchCond[0].getImm())
	- continue;
	-
	- break;
	- }
	-
	- unsigned Opc = TailCall.getOpcode() == X86::TCRETURNdi ? X86::TCRETURNdicc
	- : X86::TCRETURNdi64cc;
	-
	- auto MIB = BuildMI(MBB, I, MBB.findDebugLoc(I), get(Opc));
	- MIB->addOperand(TailCall.getOperand(0)); // Destination.
	- MIB.addImm(0); // Stack offset (not used).
	- MIB->addOperand(BranchCond[0]); // Condition.
	- MIB.copyImplicitOps(TailCall); // Regmask and (imp-used) parameters.
	-
	- I->eraseFromParent();
	-}
	-
	// Given a MBB and its TBB, find the FBB which was a fallthrough MBB (it may
	// not be a fallthrough MBB now due to layout changes). Return nullptr if the
	// fallthrough MBB cannot be identified.
	static MachineBasicBlock *getFallThroughMBB(MachineBasicBlock *MBB,
	                                            MachineBasicBlock *TBB) {
	  // Look for non-EHPad successors other than TBB. If we find exactly one, it
	  // is the fallthrough MBB. If we find zero, then TBB is both the target MBB
	  // and fallthrough MBB. If we find more than one, we cannot identify the
	  // fallthrough MBB and should return nullptr.
	  MachineBasicBlock *FallthroughBB = nullptr;
	  for (auto SI = MBB->succ_begin(), SE = MBB->succ_end(); SI != SE; ++SI) {
	    // Skip EH pads, and skip TBB once a candidate has already been recorded.
	    if ((*SI)->isEHPad() || (*SI == TBB && FallthroughBB))
	      continue;
	    // Return a nullptr if we found more than one fallthrough successor.
	    if (FallthroughBB && FallthroughBB != TBB)
	      return nullptr;
	    FallthroughBB = *SI;
	  }
	  return FallthroughBB;
	}

	/// Analyze the terminators of MBB, walking from the bottom of the block up.
	///
	/// On success (return value false) TBB, FBB and Cond describe the branch
	/// structure, and CondBranches collects the conditional branch
	/// instructions that were folded into Cond. Returns true when the
	/// terminators cannot be analyzed (a non-branch terminator, an indirect
	/// branch, or an unsupported combination of conditional branches).
	/// With AllowModify set, dead code after an unconditional jump is erased,
	/// a jump to the layout successor is removed, and a "jCC L1; jmp L2; L1:"
	/// sequence is rewritten with the inverted condition.
	bool X86InstrInfo::AnalyzeBranchImpl(
	    MachineBasicBlock &MBB, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB,
	    SmallVectorImpl<MachineOperand> &Cond,
	    SmallVectorImpl<MachineInstr *> &CondBranches, bool AllowModify) const {

	  // Start from the bottom of the block and work up, examining the
	  // terminator instructions.
	  MachineBasicBlock::iterator I = MBB.end();
	  MachineBasicBlock::iterator UnCondBrIter = MBB.end();
	  while (I != MBB.begin()) {
	    --I;
	    if (I->isDebugValue())
	      continue;

	    // Working from the bottom, when we see a non-terminator instruction, we're
	    // done.
	    if (!isUnpredicatedTerminator(*I))
	      break;

	    // A terminator that isn't a branch can't easily be handled by this
	    // analysis.
	    if (!I->isBranch())
	      return true;

	    // Handle unconditional branches.
	    if (I->getOpcode() == X86::JMP_1) {
	      UnCondBrIter = I;

	      if (!AllowModify) {
	        TBB = I->getOperand(0).getMBB();
	        continue;
	      }

	      // If the block has any instructions after a JMP, delete them.
	      while (std::next(I) != MBB.end())
	        std::next(I)->eraseFromParent();

	      Cond.clear();
	      FBB = nullptr;

	      // Delete the JMP if it's equivalent to a fall-through.
	      if (MBB.isLayoutSuccessor(I->getOperand(0).getMBB())) {
	        TBB = nullptr;
	        I->eraseFromParent();
	        I = MBB.end();
	        UnCondBrIter = MBB.end();
	        continue;
	      }

	      // TBB is used to indicate the unconditional destination.
	      TBB = I->getOperand(0).getMBB();
	      continue;
	    }

	    // Handle conditional branches.
	    X86::CondCode BranchCode = getCondFromBranchOpc(I->getOpcode());
	    if (BranchCode == X86::COND_INVALID)
	      return true; // Can't handle indirect branch.

	    // Working from the bottom, handle the first conditional branch.
	    if (Cond.empty()) {
	      MachineBasicBlock *TargetBB = I->getOperand(0).getMBB();
	      if (AllowModify && UnCondBrIter != MBB.end() &&
	          MBB.isLayoutSuccessor(TargetBB)) {
	        // If we can modify the code and it ends in something like:
	        //
	        //     jCC L1
	        //     jmp L2
	        //   L1:
	        //     ...
	        //   L2:
	        //
	        // Then we can change this to:
	        //
	        //     jnCC L2
	        //   L1:
	        //     ...
	        //   L2:
	        //
	        // Which is a bit more efficient.
	        // We conditionally jump to the fall-through block.
	        BranchCode = GetOppositeBranchCondition(BranchCode);
	        unsigned JNCC = GetCondBranchFromCond(BranchCode);
	        MachineBasicBlock::iterator OldInst = I;

	        BuildMI(MBB, UnCondBrIter, MBB.findDebugLoc(I), get(JNCC))
	            .addMBB(UnCondBrIter->getOperand(0).getMBB());
	        BuildMI(MBB, UnCondBrIter, MBB.findDebugLoc(I), get(X86::JMP_1))
	            .addMBB(TargetBB);

	        OldInst->eraseFromParent();
	        UnCondBrIter->eraseFromParent();

	        // Restart the analysis.
	        UnCondBrIter = MBB.end();
	        I = MBB.end();
	        continue;
	      }

	      FBB = TBB;
	      TBB = I->getOperand(0).getMBB();
	      Cond.push_back(MachineOperand::CreateImm(BranchCode));
	      CondBranches.push_back(&*I);
	      continue;
	    }

	    // Handle subsequent conditional branches. Only handle the case where all
	    // conditional branches branch to the same destination and their condition
	    // opcodes fit one of the special multi-branch idioms.
	    assert(Cond.size() == 1);
	    assert(TBB);

	    // If the conditions are the same, we can leave them alone.
	    X86::CondCode OldBranchCode = (X86::CondCode)Cond[0].getImm();
	    auto NewTBB = I->getOperand(0).getMBB();
	    if (OldBranchCode == BranchCode && TBB == NewTBB)
	      continue;

	    // If they differ, see if they fit one of the known patterns. Theoretically,
	    // we could handle more patterns here, but we shouldn't expect to see them
	    // if instruction selection has done a reasonable job.
	    if (TBB == NewTBB &&
	        ((OldBranchCode == X86::COND_P && BranchCode == X86::COND_NE) ||
	         (OldBranchCode == X86::COND_NE && BranchCode == X86::COND_P))) {
	      BranchCode = X86::COND_NE_OR_P;
	    } else if ((OldBranchCode == X86::COND_NP && BranchCode == X86::COND_NE) ||
	               (OldBranchCode == X86::COND_E && BranchCode == X86::COND_P)) {
	      if (NewTBB != (FBB ? FBB : getFallThroughMBB(&MBB, TBB)))
	        return true;

	      // X86::COND_E_AND_NP usually has two different branch destinations.
	      //
	      //     JP B1
	      //     JE B2
	      //     JMP B1
	      //   B1:
	      //   B2:
	      //
	      // Here this condition branches to B2 only if NP && E. It has another
	      // equivalent form:
	      //
	      //     JNE B1
	      //     JNP B2
	      //     JMP B1
	      //   B1:
	      //   B2:
	      //
	      // Similarly it branches to B2 only if E && NP. That is why this condition
	      // is named with COND_E_AND_NP.
	      BranchCode = X86::COND_E_AND_NP;
	    } else
	      return true;

	    // Update the MachineOperand.
	    Cond[0].setImm(BranchCode);
	    CondBranches.push_back(&*I);
	  }

	  return false;
	}

	/// Public branch-analysis entry point. Forwards to AnalyzeBranchImpl and
	/// discards the list of conditional branch instructions it collects.
	bool X86InstrInfo::analyzeBranch(MachineBasicBlock &MBB,
	                                 MachineBasicBlock *&TBB,
	                                 MachineBasicBlock *&FBB,
	                                 SmallVectorImpl<MachineOperand> &Cond,
	                                 bool AllowModify) const {
	  // Callers of this overload do not need the branch instructions themselves.
	  SmallVector<MachineInstr *, 4> UnusedCondBranches;
	  const bool CannotAnalyze = AnalyzeBranchImpl(MBB, TBB, FBB, Cond,
	                                               UnusedCondBranches, AllowModify);
	  return CannotAnalyze;
	}

	/// Describe the branch that terminates MBB as a MachineBranchPredicate.
	///
	/// Returns false and fills MBP.LHS / MBP.RHS / MBP.Predicate only for a
	/// "test %reg, %reg" followed by a je/jne. Returns true in every other
	/// case. MBP.TrueDest, MBP.FalseDest, MBP.ConditionDef and
	/// MBP.SingleUseCondition may already have been written when true is
	/// returned.
	bool X86InstrInfo::analyzeBranchPredicate(MachineBasicBlock &MBB,
	                                          MachineBranchPredicate &MBP,
	                                          bool AllowModify) const {
	  SmallVector<MachineOperand, 4> Cond;
	  SmallVector<MachineInstr *, 4> CondBranches;
	  if (AnalyzeBranchImpl(MBB, MBP.TrueDest, MBP.FalseDest, Cond, CondBranches,
	                        AllowModify))
	    return true;

	  if (Cond.size() != 1)
	    return true;

	  assert(MBP.TrueDest && "expected!");

	  // A null false destination means the branch falls through to the next block.
	  if (!MBP.FalseDest)
	    MBP.FalseDest = MBB.getNextNode();

	  const TargetRegisterInfo *TRI = &getRegisterInfo();

	  MachineInstr *ConditionDef = nullptr;
	  bool SingleUseCondition = true;

	  // Walk backwards from the instruction before the last one, looking for the
	  // EFLAGS definition. Any EFLAGS reader passed on the way is another use.
	  for (auto I = std::next(MBB.rbegin()), E = MBB.rend(); I != E; ++I) {
	    if (I->modifiesRegister(X86::EFLAGS, TRI)) {
	      ConditionDef = &*I;
	      break;
	    }

	    if (I->readsRegister(X86::EFLAGS, TRI))
	      SingleUseCondition = false;
	  }

	  if (!ConditionDef)
	    return true;

	  // EFLAGS live into a successor also counts as an additional use.
	  if (SingleUseCondition) {
	    for (auto *Succ : MBB.successors())
	      if (Succ->isLiveIn(X86::EFLAGS))
	        SingleUseCondition = false;
	  }

	  MBP.ConditionDef = ConditionDef;
	  MBP.SingleUseCondition = SingleUseCondition;

	  // Currently we only recognize the simple pattern:
	  //
	  //   test %reg, %reg
	  //   je %label
	  //
	  const unsigned TestOpcode =
	      Subtarget.is64Bit() ? X86::TEST64rr : X86::TEST32rr;

	  if (ConditionDef->getOpcode() == TestOpcode &&
	      ConditionDef->getNumOperands() == 3 &&
	      ConditionDef->getOperand(0).isIdenticalTo(ConditionDef->getOperand(1)) &&
	      (Cond[0].getImm() == X86::COND_NE || Cond[0].getImm() == X86::COND_E)) {
	    MBP.LHS = ConditionDef->getOperand(0);
	    MBP.RHS = MachineOperand::CreateImm(0);
	    MBP.Predicate = Cond[0].getImm() == X86::COND_NE
	                        ? MachineBranchPredicate::PRED_NE
	                        : MachineBranchPredicate::PRED_EQ;
	    return false;
	  }

	  return true;
	}

	/// Erase the direct branches (JMP_1 and the conditional J*_1 forms) at the
	/// end of MBB and return how many were removed. Debug values are skipped;
	/// the scan stops at the first other instruction. BytesRemoved must be
	/// null because code size is not tracked here.
	unsigned X86InstrInfo::removeBranch(MachineBasicBlock &MBB,
	                                    int *BytesRemoved) const {
	  assert(!BytesRemoved && "code size not handled");

	  unsigned NumErased = 0;
	  MachineBasicBlock::iterator It = MBB.end();

	  while (It != MBB.begin()) {
	    --It;
	    if (It->isDebugValue())
	      continue;

	    const bool IsDirectBranch =
	        It->getOpcode() == X86::JMP_1 ||
	        getCondFromBranchOpc(It->getOpcode()) != X86::COND_INVALID;
	    if (!IsDirectBranch)
	      break;

	    // Remove the branch, then rescan from the end of the block.
	    It->eraseFromParent();
	    It = MBB.end();
	    ++NumErased;
	  }

	  return NumErased;
	}

	/// Append branch instructions to the end of MBB and return how many were
	/// inserted.
	///
	/// An empty Cond inserts a single unconditional jump to TBB. Otherwise
	/// Cond[0] holds an X86::CondCode: the composite codes COND_NE_OR_P and
	/// COND_E_AND_NP expand to two conditional jumps, and any other code to
	/// one. A non-null FBB adds a trailing unconditional jump to it; a null
	/// FBB means the false path falls through. BytesAdded must be null
	/// because code size is not tracked here.
	unsigned X86InstrInfo::insertBranch(MachineBasicBlock &MBB,
	                                    MachineBasicBlock *TBB,
	                                    MachineBasicBlock *FBB,
	                                    ArrayRef<MachineOperand> Cond,
	                                    const DebugLoc &DL,
	                                    int *BytesAdded) const {
	  // Shouldn't be a fall through.
	  assert(TBB && "insertBranch must not be told to insert a fallthrough");
	  assert((Cond.size() == 1 || Cond.size() == 0) &&
	         "X86 branch conditions have one component!");
	  assert(!BytesAdded && "code size not handled");

	  if (Cond.empty()) {
	    // Unconditional branch?
	    assert(!FBB && "Unconditional branch with multiple successors!");
	    BuildMI(&MBB, DL, get(X86::JMP_1)).addMBB(TBB);
	    return 1;
	  }

	  // If FBB is null, it is implied to be a fall-through block.
	  // Captured here because the COND_E_AND_NP case below may assign FBB.
	  bool FallThru = FBB == nullptr;

	  // Conditional branch.
	  unsigned Count = 0;
	  X86::CondCode CC = (X86::CondCode)Cond[0].getImm();
	  switch (CC) {
	  case X86::COND_NE_OR_P:
	    // Synthesize NE_OR_P with two branches.
	    BuildMI(&MBB, DL, get(X86::JNE_1)).addMBB(TBB);
	    ++Count;
	    BuildMI(&MBB, DL, get(X86::JP_1)).addMBB(TBB);
	    ++Count;
	    break;
	  case X86::COND_E_AND_NP:
	    // Use the next block of MBB as FBB if it is null.
	    if (FBB == nullptr) {
	      FBB = getFallThroughMBB(&MBB, TBB);
	      assert(FBB && "MBB cannot be the last block in function when the false "
	                    "body is a fall-through.");
	    }
	    // Synthesize COND_E_AND_NP with two branches.
	    BuildMI(&MBB, DL, get(X86::JNE_1)).addMBB(FBB);
	    ++Count;
	    BuildMI(&MBB, DL, get(X86::JNP_1)).addMBB(TBB);
	    ++Count;
	    break;
	  default: {
	    unsigned Opc = GetCondBranchFromCond(CC);
	    BuildMI(&MBB, DL, get(Opc)).addMBB(TBB);
	    ++Count;
	  }
	  }
	  if (!FallThru) {
	    // Two-way Conditional branch. Insert the second branch.
	    BuildMI(&MBB, DL, get(X86::JMP_1)).addMBB(FBB);
	    ++Count;
	  }
	  return Count;
	}

	/// Report whether a select between TrueReg and FalseReg under Cond can be
	/// lowered to a CMOV.
	///
	/// Requires a subtarget with CMOV, a single-component non-composite
	/// condition, and a common register subclass that is a 16-, 32- or 64-bit
	/// general purpose class. On success all three cycle estimates are set
	/// to 2 and true is returned.
	bool X86InstrInfo::
	canInsertSelect(const MachineBasicBlock &MBB,
	                ArrayRef<MachineOperand> Cond,
	                unsigned TrueReg, unsigned FalseReg,
	                int &CondCycles, int &TrueCycles, int &FalseCycles) const {
	  // Not all subtargets have cmov instructions.
	  if (!Subtarget.hasCMov())
	    return false;
	  if (Cond.size() != 1)
	    return false;
	  // We cannot do the composite conditions, at least not in SSA form.
	  if ((X86::CondCode)Cond[0].getImm() > X86::COND_S)
	    return false;

	  // Check register classes.
	  const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
	  const TargetRegisterClass *RC =
	      RI.getCommonSubClass(MRI.getRegClass(TrueReg), MRI.getRegClass(FalseReg));
	  if (!RC)
	    return false;

	  // We have cmov instructions for 16, 32, and 64 bit general purpose registers.
	  if (X86::GR16RegClass.hasSubClassEq(RC) ||
	      X86::GR32RegClass.hasSubClassEq(RC) ||
	      X86::GR64RegClass.hasSubClassEq(RC)) {
	    // This latency applies to Pentium M, Merom, Wolfdale, Nehalem, and Sandy
	    // Bridge. Probably Ivy Bridge as well.
	    CondCycles = 2;
	    TrueCycles = 2;
	    FalseCycles = 2;
	    return true;
	  }

	  // Can't do vectors.
	  return false;
	}

	/// Insert a register-register CMOV before I that defines DstReg, with
	/// FalseReg as the first source operand and TrueReg as the second. The
	/// opcode is chosen from Cond[0] and the byte size of DstReg's register
	/// class.
	void X86InstrInfo::insertSelect(MachineBasicBlock &MBB,
	                                MachineBasicBlock::iterator I,
	                                const DebugLoc &DL, unsigned DstReg,
	                                ArrayRef<MachineOperand> Cond, unsigned TrueReg,
	                                unsigned FalseReg) const {
	  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
	  assert(Cond.size() == 1 && "Invalid Cond array");
	  unsigned Opc = getCMovFromCond((X86::CondCode)Cond[0].getImm(),
	                                 MRI.getRegClass(DstReg)->getSize(),
	                                 /*HasMemoryOperand=*/false);
	  BuildMI(MBB, I, DL, get(Opc), DstReg).addReg(FalseReg).addReg(TrueReg);
	}

	/// Return true when Reg belongs to the GR8_ABCD_H register class, i.e. it
	/// is one of the physical h registers.
	static bool isHReg(unsigned Reg) {
	  const bool InHighByteClass = X86::GR8_ABCD_HRegClass.contains(Reg);
	  return InHighByteClass;
	}

	/// Pick the opcode for a copy between registers of different kinds: mask
	/// registers and general-purpose registers, VR128/VR64 and GR64, or FR32 and
	/// GR32.
	///
	/// \p DestReg and \p SrcReg are taken by reference: when the general-purpose
	/// side of a mask copy is a 16-bit or 8-bit register, it is replaced by the
	/// result of getX86SubSuperRegister(Reg, 32) before the opcode is returned.
	///
	/// \returns the move opcode, or 0 when none of the handled combinations
	/// matches.
	static unsigned CopyToFromAsymmetricReg(unsigned &DestReg, unsigned &SrcReg,
	                                        const X86Subtarget &Subtarget) {
	  const bool UseAVX = Subtarget.hasAVX();
	  const bool UseAVX512 = Subtarget.hasAVX512();

	  // Choose among the AVX-512, AVX and SSE variants of one move; AVX-512 is
	  // tested first, then AVX.
	  auto SelectByISA = [UseAVX, UseAVX512](unsigned AVX512Opc, unsigned AVXOpc,
	                                         unsigned SSEOpc) -> unsigned {
	    if (UseAVX512)
	      return AVX512Opc;
	    return UseAVX ? AVXOpc : SSEOpc;
	  };

	  // Mask register -> GR64 / GR32 / GR16 / GR8.
	  // Every KMASK register class holds the same k registers, so testing
	  // against any one of them (VK16 here) is sufficient.
	  if (X86::VK16RegClass.contains(SrcReg)) {
	    if (X86::GR64RegClass.contains(DestReg)) {
	      assert(Subtarget.hasBWI());
	      return X86::KMOVQrk;
	    }
	    if (X86::GR32RegClass.contains(DestReg)) {
	      if (Subtarget.hasBWI())
	        return X86::KMOVDrk;
	      return X86::KMOVWrk;
	    }
	    if (X86::GR16RegClass.contains(DestReg)) {
	      DestReg = getX86SubSuperRegister(DestReg, 32);
	      return X86::KMOVWrk;
	    }
	    if (X86::GR8RegClass.contains(DestReg)) {
	      DestReg = getX86SubSuperRegister(DestReg, 32);
	      if (Subtarget.hasDQI())
	        return X86::KMOVBrk;
	      return X86::KMOVWrk;
	    }
	  }

	  // GR64 / GR32 / GR16 / GR8 -> mask register.
	  // As above, VK16 stands in for all KMASK register classes.
	  if (X86::VK16RegClass.contains(DestReg)) {
	    if (X86::GR64RegClass.contains(SrcReg)) {
	      assert(Subtarget.hasBWI());
	      return X86::KMOVQkr;
	    }
	    if (X86::GR32RegClass.contains(SrcReg)) {
	      if (Subtarget.hasBWI())
	        return X86::KMOVDkr;
	      return X86::KMOVWkr;
	    }
	    if (X86::GR16RegClass.contains(SrcReg)) {
	      SrcReg = getX86SubSuperRegister(SrcReg, 32);
	      return X86::KMOVWkr;
	    }
	    if (X86::GR8RegClass.contains(SrcReg)) {
	      SrcReg = getX86SubSuperRegister(SrcReg, 32);
	      if (Subtarget.hasDQI())
	        return X86::KMOVBkr;
	      return X86::KMOVWkr;
	    }
	  }

	  // VR128 / VR64 <-> GR64.
	  // The GR64-source branch is only considered when the destination is not a
	  // GR64; an unmatched pair falls through to the FR32/GR32 checks below.
	  if (X86::GR64RegClass.contains(DestReg)) {
	    // VR128 -> GR64.
	    if (X86::VR128XRegClass.contains(SrcReg))
	      return SelectByISA(X86::VMOVPQIto64Zrr, X86::VMOVPQIto64rr,
	                         X86::MOVPQIto64rr);
	    // VR64 -> GR64.
	    if (X86::VR64RegClass.contains(SrcReg))
	      return X86::MMX_MOVD64from64rr;
	  } else if (X86::GR64RegClass.contains(SrcReg)) {
	    // GR64 -> VR128.
	    if (X86::VR128XRegClass.contains(DestReg))
	      return SelectByISA(X86::VMOV64toPQIZrr, X86::VMOV64toPQIrr,
	                         X86::MOV64toPQIrr);
	    // GR64 -> VR64.
	    if (X86::VR64RegClass.contains(DestReg))
	      return X86::MMX_MOVD64to64rr;
	  }

	  // FR32 -> GR32.
	  if (X86::GR32RegClass.contains(DestReg) &&
	      X86::FR32XRegClass.contains(SrcReg))
	    return SelectByISA(X86::VMOVSS2DIZrr, X86::VMOVSS2DIrr, X86::MOVSS2DIrr);

	  // GR32 -> FR32.
	  if (X86::FR32XRegClass.contains(DestReg) &&
	      X86::GR32RegClass.contains(SrcReg))
	    return SelectByISA(X86::VMOVDI2SSZrr, X86::VMOVDI2SSrr, X86::MOVDI2SSrr);

	  // No asymmetric copy applies.
	  return 0;
	}

	/// Emit the instruction(s) that copy physical register \p SrcReg into
	/// \p DestReg, inserted before \p MI in \p MBB.
	///
	/// Three strategies are tried in order:
	///  1. A single move when both registers belong to the same class (GR64,
	///     GR32, GR16, GR8, VR64, VR128X, VR256X, VR512 or the mask registers).
	///     Without VLX, copies involving extended XMM/YMM registers are widened
	///     to a 512-bit move on the matching ZMM super-registers.
	///  2. A single move chosen by CopyToFromAsymmetricReg for copies between
	///     different kinds of registers.
	///  3. A multi-instruction sequence for copies between EFLAGS and a 32-bit
	///     or 64-bit general-purpose register: PUSHF/POP (or PUSH/POPF) when
	///     the subtarget lacks LAHF/SAHF, otherwise SETO+LAHF / ADD+SAHF via
	///     AL/AH, preserving RAX/EAX with PUSH/POP when it may be live.
	/// If none applies, the function reports the pair via DEBUG output and hits
	/// llvm_unreachable.
	///
	/// \p KillSrc is applied as the kill flag where the emitted code reads
	/// SrcReg as an instruction operand.
	void X86InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
	                               MachineBasicBlock::iterator MI,
	                               const DebugLoc &DL, unsigned DestReg,
	                               unsigned SrcReg, bool KillSrc) const {
	  // First deal with the normal symmetric copies.
	  bool HasAVX = Subtarget.hasAVX();
	  bool HasVLX = Subtarget.hasVLX();
	  unsigned Opc = 0;
	  if (X86::GR64RegClass.contains(DestReg, SrcReg))
	    Opc = X86::MOV64rr;
	  else if (X86::GR32RegClass.contains(DestReg, SrcReg))
	    Opc = X86::MOV32rr;
	  else if (X86::GR16RegClass.contains(DestReg, SrcReg))
	    Opc = X86::MOV16rr;
	  else if (X86::GR8RegClass.contains(DestReg, SrcReg)) {
	    // Copying to or from a physical H register on x86-64 requires a NOREX
	    // move. Otherwise use a normal move.
	    if ((isHReg(DestReg) || isHReg(SrcReg)) &&
	        Subtarget.is64Bit()) {
	      Opc = X86::MOV8rr_NOREX;
	      // Both operands must be encodable without an REX prefix.
	      assert(X86::GR8_NOREXRegClass.contains(SrcReg, DestReg) &&
	             "8-bit H register can not be copied outside GR8_NOREX");
	    } else
	      Opc = X86::MOV8rr;
	  }
	  else if (X86::VR64RegClass.contains(DestReg, SrcReg))
	    Opc = X86::MMX_MOVQ64rr;
	  else if (X86::VR128XRegClass.contains(DestReg, SrcReg)) {
	    if (HasVLX)
	      Opc = X86::VMOVAPSZ128rr;
	    else if (X86::VR128RegClass.contains(DestReg, SrcReg))
	      Opc = HasAVX ? X86::VMOVAPSrr : X86::MOVAPSrr;
	    else {
	      // If this is an extended register and we don't have VLX we need to use
	      // a 512-bit move.
	      Opc = X86::VMOVAPSZrr;
	      const TargetRegisterInfo *TRI = &getRegisterInfo();
	      DestReg = TRI->getMatchingSuperReg(DestReg, X86::sub_xmm,
	                                         &X86::VR512RegClass);
	      SrcReg = TRI->getMatchingSuperReg(SrcReg, X86::sub_xmm,
	                                        &X86::VR512RegClass);
	    }
	  } else if (X86::VR256XRegClass.contains(DestReg, SrcReg)) {
	    if (HasVLX)
	      Opc = X86::VMOVAPSZ256rr;
	    else if (X86::VR256RegClass.contains(DestReg, SrcReg))
	      Opc = X86::VMOVAPSYrr;
	    else {
	      // If this is an extended register and we don't have VLX we need to use
	      // a 512-bit move.
	      Opc = X86::VMOVAPSZrr;
	      const TargetRegisterInfo *TRI = &getRegisterInfo();
	      DestReg = TRI->getMatchingSuperReg(DestReg, X86::sub_ymm,
	                                         &X86::VR512RegClass);
	      SrcReg = TRI->getMatchingSuperReg(SrcReg, X86::sub_ymm,
	                                        &X86::VR512RegClass);
	    }
	  } else if (X86::VR512RegClass.contains(DestReg, SrcReg))
	    Opc = X86::VMOVAPSZrr;
	  // All KMASK RegClasses hold the same k registers, can be tested against anyone.
	  else if (X86::VK16RegClass.contains(DestReg, SrcReg))
	    Opc = Subtarget.hasBWI() ? X86::KMOVQkk : X86::KMOVWkk;
	  // No symmetric move found: try a copy between different register kinds.
	  if (!Opc)
	    Opc = CopyToFromAsymmetricReg(DestReg, SrcReg, Subtarget);

	  if (Opc) {
	    BuildMI(MBB, MI, DL, get(Opc), DestReg)
	      .addReg(SrcReg, getKillRegState(KillSrc));
	    return;
	  }

	  // Remaining supported case: a copy to or from EFLAGS. Reg is the
	  // non-EFLAGS side of the copy.
	  bool FromEFLAGS = SrcReg == X86::EFLAGS;
	  bool ToEFLAGS = DestReg == X86::EFLAGS;
	  int Reg = FromEFLAGS ? DestReg : SrcReg;
	  bool is32 = X86::GR32RegClass.contains(Reg);
	  bool is64 = X86::GR64RegClass.contains(Reg);

	  if ((FromEFLAGS || ToEFLAGS) && (is32 || is64)) {
	    // Select the 32-bit or 64-bit flavour of every opcode used below.
	    int Mov = is64 ? X86::MOV64rr : X86::MOV32rr;
	    int Push = is64 ? X86::PUSH64r : X86::PUSH32r;
	    int PushF = is64 ? X86::PUSHF64 : X86::PUSHF32;
	    int Pop = is64 ? X86::POP64r : X86::POP32r;
	    int PopF = is64 ? X86::POPF64 : X86::POPF32;
	    int AX = is64 ? X86::RAX : X86::EAX;

	    if (!Subtarget.hasLAHFSAHF()) {
	      assert(Subtarget.is64Bit() &&
	             "Not having LAHF/SAHF only happens on 64-bit.");
	      // Moving EFLAGS to / from another register requires a push and a pop.
	      // Notice that we have to adjust the stack if we don't want to clobber the
	      // first frame index. See X86FrameLowering.cpp - usesTheStack.
	      if (FromEFLAGS) {
	        BuildMI(MBB, MI, DL, get(PushF));
	        BuildMI(MBB, MI, DL, get(Pop), DestReg);
	      }
	      if (ToEFLAGS) {
	        BuildMI(MBB, MI, DL, get(Push))
	            .addReg(SrcReg, getKillRegState(KillSrc));
	        BuildMI(MBB, MI, DL, get(PopF));
	      }
	      return;
	    }

	    // The flags need to be saved, but saving EFLAGS with PUSHF/POPF is
	    // inefficient. Instead:
	    //   - Save the overflow flag OF into AL using SETO, and restore it using a
	    //     signed 8-bit addition of AL and INT8_MAX.
	    //   - Save/restore the bottom 8 EFLAGS bits (CF, PF, AF, ZF, SF) to/from AH
	    //     using LAHF/SAHF.
	    //   - When RAX/EAX is live and isn't the destination register, make sure it
	    //     isn't clobbered by PUSH/POP'ing it before and after saving/restoring
	    //     the flags.
	    // This approach is ~2.25x faster than using PUSHF/POPF.
	    //
	    // This is still somewhat inefficient because we don't know which flags are
	    // actually live inside EFLAGS. Were we able to do a single SETcc instead of
	    // SETO+LAHF / ADDB+SAHF the code could be 1.02x faster.
	    //
	    // PUSHF/POPF is also potentially incorrect because it affects other flags
	    // such as TF/IF/DF, which LLVM doesn't model.
	    //
	    // Notice that we have to adjust the stack if we don't want to clobber the
	    // first frame index.
	    // See X86ISelLowering.cpp - X86::hasCopyImplyingStackAdjustment.

	    const TargetRegisterInfo *TRI = &getRegisterInfo();
	    MachineBasicBlock::LivenessQueryResult LQR =
	        MBB.computeRegisterLiveness(TRI, AX, MI);
	    // We do not want to save and restore AX if we do not have to.
	    // Moreover, if we do so whereas AX is dead, we would need to set
	    // an undef flag on the use of AX, otherwise the verifier will
	    // complain that we read an undef value.
	    // We do not want to change the behavior of the machine verifier
	    // as this is usually wrong to read an undef value.
	    if (MachineBasicBlock::LQR_Unknown == LQR) {
	      // The quick query was inconclusive: recompute liveness at MI by walking
	      // backward from the block's live-outs.
	      LivePhysRegs LPR(TRI);
	      LPR.addLiveOuts(MBB);
	      MachineBasicBlock::iterator I = MBB.end();
	      while (I != MI) {
	        --I;
	        LPR.stepBackward(*I);
	      }
	      // AX contains the top most register in the aliasing hierarchy.
	      // It may not be live, but one of its aliases may be.
	      for (MCRegAliasIterator AI(AX, TRI, true);
	           AI.isValid() && LQR != MachineBasicBlock::LQR_Live; ++AI)
	        LQR = LPR.contains(*AI) ? MachineBasicBlock::LQR_Live
	                                : MachineBasicBlock::LQR_Dead;
	    }
	    // AX needs no preservation when it is itself the copy's register or when
	    // it is known to be dead here.
	    bool AXDead = (Reg == AX) || (MachineBasicBlock::LQR_Dead == LQR);
	    if (!AXDead)
	      BuildMI(MBB, MI, DL, get(Push)).addReg(AX, getKillRegState(true));
	    if (FromEFLAGS) {
	      BuildMI(MBB, MI, DL, get(X86::SETOr), X86::AL);
	      BuildMI(MBB, MI, DL, get(X86::LAHF));
	      BuildMI(MBB, MI, DL, get(Mov), Reg).addReg(AX);
	    }
	    if (ToEFLAGS) {
	      BuildMI(MBB, MI, DL, get(Mov), AX).addReg(Reg, getKillRegState(KillSrc));
	      BuildMI(MBB, MI, DL, get(X86::ADD8ri), X86::AL)
	          .addReg(X86::AL)
	          .addImm(INT8_MAX);
	      BuildMI(MBB, MI, DL, get(X86::SAHF));
	    }
	    if (!AXDead)
	      BuildMI(MBB, MI, DL, get(Pop), AX);
	    return;
	  }

	  DEBUG(dbgs() << "Cannot copy " << RI.getName(SrcReg)
	               << " to " << RI.getName(DestReg) << '\n');
	  llvm_unreachable("Cannot emit physreg copy instruction");
	}

	/// Select the opcode used to load or store a register of class \p RC from or
	/// to memory.
	///
	/// The choice is driven by the size of \p RC (1, 2, 4, 8, 10, 16, 32 or 64),
	/// then by the concrete register class, and by the AVX / AVX-512 / VLX
	/// features of \p STI.
	///
	/// \param Reg            the register being moved; only consulted for 1-byte
	///                       classes, to detect an h register.
	/// \param RC             register class of the value.
	/// \param isStackAligned selects the aligned (MOVAPS-style) rather than the
	///                       unaligned (MOVUPS-style) opcode for 16-, 32- and
	///                       64-byte classes.
	/// \param STI            subtarget supplying the feature queries.
	/// \param load           true to return the load opcode, false for the store.
	/// \returns the opcode; an unsupported size or register class hits
	///          llvm_unreachable or an assertion.
	static unsigned getLoadStoreRegOpcode(unsigned Reg,
	                                      const TargetRegisterClass *RC,
	                                      bool isStackAligned,
	                                      const X86Subtarget &STI,
	                                      bool load) {
	  bool HasAVX = STI.hasAVX();
	  bool HasAVX512 = STI.hasAVX512();
	  bool HasVLX = STI.hasVLX();

	  switch (RC->getSize()) {
	  default:
	    llvm_unreachable("Unknown spill size");
	  case 1:
	    assert(X86::GR8RegClass.hasSubClassEq(RC) && "Unknown 1-byte regclass");
	    // Copying to or from a physical H register on x86-64 requires a NOREX
	    // move. Otherwise use a normal move.
	    if (STI.is64Bit() &&
	        (isHReg(Reg) || X86::GR8_ABCD_HRegClass.hasSubClassEq(RC)))
	      return load ? X86::MOV8rm_NOREX : X86::MOV8mr_NOREX;
	    return load ? X86::MOV8rm : X86::MOV8mr;
	  case 2:
	    if (X86::VK16RegClass.hasSubClassEq(RC))
	      return load ? X86::KMOVWkm : X86::KMOVWmk;
	    assert(X86::GR16RegClass.hasSubClassEq(RC) && "Unknown 2-byte regclass");
	    return load ? X86::MOV16rm : X86::MOV16mr;
	  case 4:
	    if (X86::GR32RegClass.hasSubClassEq(RC))
	      return load ? X86::MOV32rm : X86::MOV32mr;
	    if (X86::FR32XRegClass.hasSubClassEq(RC))
	      return load ?
	        (HasAVX512 ? X86::VMOVSSZrm : HasAVX ? X86::VMOVSSrm : X86::MOVSSrm) :
	        (HasAVX512 ? X86::VMOVSSZmr : HasAVX ? X86::VMOVSSmr : X86::MOVSSmr);
	    if (X86::RFP32RegClass.hasSubClassEq(RC))
	      return load ? X86::LD_Fp32m : X86::ST_Fp32m;
	    if (X86::VK32RegClass.hasSubClassEq(RC))
	      return load ? X86::KMOVDkm : X86::KMOVDmk;
	    llvm_unreachable("Unknown 4-byte regclass");
	  case 8:
	    if (X86::GR64RegClass.hasSubClassEq(RC))
	      return load ? X86::MOV64rm : X86::MOV64mr;
	    if (X86::FR64XRegClass.hasSubClassEq(RC))
	      return load ?
	        (HasAVX512 ? X86::VMOVSDZrm : HasAVX ? X86::VMOVSDrm : X86::MOVSDrm) :
	        (HasAVX512 ? X86::VMOVSDZmr : HasAVX ? X86::VMOVSDmr : X86::MOVSDmr);
	    if (X86::VR64RegClass.hasSubClassEq(RC))
	      return load ? X86::MMX_MOVQ64rm : X86::MMX_MOVQ64mr;
	    if (X86::RFP64RegClass.hasSubClassEq(RC))
	      return load ? X86::LD_Fp64m : X86::ST_Fp64m;
	    if (X86::VK64RegClass.hasSubClassEq(RC))
	      return load ? X86::KMOVQkm : X86::KMOVQmk;
	    llvm_unreachable("Unknown 8-byte regclass");
	  case 10:
	    assert(X86::RFP80RegClass.hasSubClassEq(RC) && "Unknown 10-byte regclass");
	    return load ? X86::LD_Fp80m : X86::ST_FpP80m;
	  case 16: {
	    assert(X86::VR128XRegClass.hasSubClassEq(RC) && "Unknown 16-byte regclass");
	    // If stack is realigned we can use aligned stores.
	    if (isStackAligned)
	      return load ?
	        (HasVLX    ? X86::VMOVAPSZ128rm :
	         HasAVX512 ? X86::VMOVAPSZ128rm_NOVLX :
	         HasAVX    ? X86::VMOVAPSrm :
	                     X86::MOVAPSrm):
	        (HasVLX    ? X86::VMOVAPSZ128mr :
	         HasAVX512 ? X86::VMOVAPSZ128mr_NOVLX :
	         HasAVX    ? X86::VMOVAPSmr :
	                     X86::MOVAPSmr);
	    else
	      return load ?
	        (HasVLX    ? X86::VMOVUPSZ128rm :
	         HasAVX512 ? X86::VMOVUPSZ128rm_NOVLX :
	         HasAVX    ? X86::VMOVUPSrm :
	                     X86::MOVUPSrm):
	        (HasVLX    ? X86::VMOVUPSZ128mr :
	         HasAVX512 ? X86::VMOVUPSZ128mr_NOVLX :
	         HasAVX    ? X86::VMOVUPSmr :
	                     X86::MOVUPSmr);
	  }
	  case 32:
	    assert(X86::VR256XRegClass.hasSubClassEq(RC) && "Unknown 32-byte regclass");
	    // If stack is realigned we can use aligned stores.
	    if (isStackAligned)
	      return load ?
	        (HasVLX    ? X86::VMOVAPSZ256rm :
	         HasAVX512 ? X86::VMOVAPSZ256rm_NOVLX :
	                     X86::VMOVAPSYrm) :
	        (HasVLX    ? X86::VMOVAPSZ256mr :
	         HasAVX512 ? X86::VMOVAPSZ256mr_NOVLX :
	                     X86::VMOVAPSYmr);
	    else
	      return load ?
	        (HasVLX    ? X86::VMOVUPSZ256rm :
	         HasAVX512 ? X86::VMOVUPSZ256rm_NOVLX :
	                     X86::VMOVUPSYrm) :
	        (HasVLX    ? X86::VMOVUPSZ256mr :
	         HasAVX512 ? X86::VMOVUPSZ256mr_NOVLX :
	                     X86::VMOVUPSYmr);
	  case 64:
	    assert(X86::VR512RegClass.hasSubClassEq(RC) && "Unknown 64-byte regclass");
	    assert(STI.hasAVX512() && "Using 512-bit register requires AVX512");
	    if (isStackAligned)
	      return load ? X86::VMOVAPSZrm : X86::VMOVAPSZmr;
	    else
	      return load ? X86::VMOVUPSZrm : X86::VMOVUPSZmr;
	  }
	}

	/// Decompose the memory reference of \p MemOp into a base register and a
	/// constant displacement.
	///
	/// Succeeds only when the instruction has a memory operand whose base is a
	/// register, whose scale amount is 1, which has no index register, and whose
	/// displacement is an immediate.
	///
	/// \p BaseReg is written as soon as the base operand is known to be a
	/// register, so it may have been updated even when false is returned;
	/// \p Offset is written only on success. \p TRI is not used.
	bool X86InstrInfo::getMemOpBaseRegImmOfs(MachineInstr &MemOp, unsigned &BaseReg,
	                                         int64_t &Offset,
	                                         const TargetRegisterInfo *TRI) const {
	  const MCInstrDesc &Desc = MemOp.getDesc();

	  // A negative operand number means the instruction has no memory reference.
	  const int RawMemIdx = X86II::getMemoryOperandNo(Desc.TSFlags);
	  if (RawMemIdx < 0)
	    return false;

	  // Index of the first address operand once the operand bias is applied.
	  const int AddrStart = RawMemIdx + X86II::getOperandBias(Desc);

	  // The base may be something other than a register, e.g. an MO_FrameIndex.
	  MachineOperand &Base = MemOp.getOperand(AddrStart + X86::AddrBaseReg);
	  if (!Base.isReg())
	    return false;
	  BaseReg = Base.getReg();

	  // Only unscaled addresses are accepted.
	  const MachineOperand &Scale = MemOp.getOperand(AddrStart + X86::AddrScaleAmt);
	  if (Scale.getImm() != 1)
	    return false;

	  // An index register rules the address out.
	  const MachineOperand &Index = MemOp.getOperand(AddrStart + X86::AddrIndexReg);
	  if (Index.getReg() != X86::NoRegister)
	    return false;

	  // The displacement can be symbolic; only immediates are supported.
	  const MachineOperand &Disp = MemOp.getOperand(AddrStart + X86::AddrDisp);
	  if (!Disp.isImm())
	    return false;

	  Offset = Disp.getImm();
	  return true;
	}

	/// Return the opcode that stores \p SrcReg of class \p RC to memory; a thin
	/// wrapper that asks getLoadStoreRegOpcode for the store variant.
	static unsigned getStoreRegOpcode(unsigned SrcReg,
	                                  const TargetRegisterClass *RC,
	                                  bool isStackAligned,
	                                  const X86Subtarget &STI) {
	  const bool WantLoad = false;
	  return getLoadStoreRegOpcode(SrcReg, RC, isStackAligned, STI, WantLoad);
	}


	/// Return the opcode that loads \p DestReg of class \p RC from memory; a thin
	/// wrapper that asks getLoadStoreRegOpcode for the load variant.
	static unsigned getLoadRegOpcode(unsigned DestReg,
	                                 const TargetRegisterClass *RC,
	                                 bool isStackAligned,
	                                 const X86Subtarget &STI) {
	  const bool WantLoad = true;
	  return getLoadStoreRegOpcode(DestReg, RC, isStackAligned, STI, WantLoad);
	}

	/// Emit a store of \p SrcReg (register class \p RC) to stack slot
	/// \p FrameIdx, inserted before \p MI in \p MBB.
	///
	/// The aligned store opcode is used when the target's stack alignment is at
	/// least max(size of RC, 16), or when the stack of this function can be
	/// realigned; otherwise the unaligned form is used. \p isKill is applied as
	/// the kill flag on the source operand. \p TRI is not used.
	void X86InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
	                                       MachineBasicBlock::iterator MI,
	                                       unsigned SrcReg, bool isKill, int FrameIdx,
	                                       const TargetRegisterClass *RC,
	                                       const TargetRegisterInfo *TRI) const {
	  const MachineFunction &MF = *MBB.getParent();
	  assert(MF.getFrameInfo().getObjectSize(FrameIdx) >= RC->getSize() &&
	         "Stack slot too small for store");
	  unsigned Alignment = std::max<uint32_t>(RC->getSize(), 16);
	  bool isAligned =
	      (Subtarget.getFrameLowering()->getStackAlignment() >= Alignment) ||
	      RI.canRealignStack(MF);
	  unsigned Opc = getStoreRegOpcode(SrcReg, RC, isAligned, Subtarget);
	  DebugLoc DL = MBB.findDebugLoc(MI);
	  // Address operands (the frame reference) come first, then the stored value.
	  addFrameReference(BuildMI(MBB, MI, DL, get(Opc)), FrameIdx)
	    .addReg(SrcReg, getKillRegState(isKill));
	}

	/// Build (without inserting into a block) a store of \p SrcReg, of register
	/// class \p RC, to the address described by the operands in \p Addr, and
	/// append the new instruction to \p NewMIs.
	///
	/// The aligned opcode is chosen only when at least one memory operand is
	/// supplied and the first one is aligned to max(size of RC, 16) or more.
	/// The range [\p MMOBegin, \p MMOEnd) is attached to the new instruction as
	/// its memory references; \p isKill is applied to the source operand.
	void X86InstrInfo::storeRegToAddr(MachineFunction &MF, unsigned SrcReg,
	                                  bool isKill,
	                                  SmallVectorImpl<MachineOperand> &Addr,
	                                  const TargetRegisterClass *RC,
	                                  MachineInstr::mmo_iterator MMOBegin,
	                                  MachineInstr::mmo_iterator MMOEnd,
	                                  SmallVectorImpl<MachineInstr*> &NewMIs) const {
	  const unsigned RequiredAlign = std::max<uint32_t>(RC->getSize(), 16);

	  // With no memory operand there is nothing to prove alignment.
	  bool UseAligned = false;
	  if (MMOBegin != MMOEnd)
	    UseAligned = (*MMOBegin)->getAlignment() >= RequiredAlign;

	  const unsigned StoreOpc = getStoreRegOpcode(SrcReg, RC, UseAligned, Subtarget);
	  DebugLoc NoLoc;
	  MachineInstrBuilder Builder = BuildMI(MF, NoLoc, get(StoreOpc));

	  // Address operands first, then the value being stored.
	  for (const MachineOperand &AddrOp : Addr)
	    Builder.addOperand(AddrOp);
	  Builder.addReg(SrcReg, getKillRegState(isKill));

	  Builder->setMemRefs(MMOBegin, MMOEnd);
	  NewMIs.push_back(Builder);
	}


	/// Emit a load of \p DestReg (register class \p RC) from stack slot
	/// \p FrameIdx, inserted before \p MI in \p MBB.
	///
	/// The aligned load opcode is used when the target's stack alignment is at
	/// least max(size of RC, 16), or when the stack of this function can be
	/// realigned; otherwise the unaligned form is used. \p TRI is not used.
	void X86InstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
	                                        MachineBasicBlock::iterator MI,
	                                        unsigned DestReg, int FrameIdx,
	                                        const TargetRegisterClass *RC,
	                                        const TargetRegisterInfo *TRI) const {
	  const MachineFunction &MF = *MBB.getParent();
	  unsigned Alignment = std::max<uint32_t>(RC->getSize(), 16);
	  bool isAligned =
	      (Subtarget.getFrameLowering()->getStackAlignment() >= Alignment) ||
	      RI.canRealignStack(MF);
	  unsigned Opc = getLoadRegOpcode(DestReg, RC, isAligned, Subtarget);
	  DebugLoc DL = MBB.findDebugLoc(MI);
	  // The destination register is the def; the frame reference is the address.
	  addFrameReference(BuildMI(MBB, MI, DL, get(Opc), DestReg), FrameIdx);
	}

	/// Build (without inserting into a block) a load into \p DestReg, of
	/// register class \p RC, from the address described by the operands in
	/// \p Addr, and append the new instruction to \p NewMIs.
	///
	/// The aligned opcode is chosen only when at least one memory operand is
	/// supplied and the first one is aligned to max(size of RC, 16) or more.
	/// The range [\p MMOBegin, \p MMOEnd) is attached to the new instruction as
	/// its memory references.
	void X86InstrInfo::loadRegFromAddr(MachineFunction &MF, unsigned DestReg,
	                                   SmallVectorImpl<MachineOperand> &Addr,
	                                   const TargetRegisterClass *RC,
	                                   MachineInstr::mmo_iterator MMOBegin,
	                                   MachineInstr::mmo_iterator MMOEnd,
	                                   SmallVectorImpl<MachineInstr*> &NewMIs) const {
	  const unsigned RequiredAlign = std::max<uint32_t>(RC->getSize(), 16);

	  // With no memory operand there is nothing to prove alignment.
	  bool UseAligned = false;
	  if (MMOBegin != MMOEnd)
	    UseAligned = (*MMOBegin)->getAlignment() >= RequiredAlign;

	  const unsigned LoadOpc = getLoadRegOpcode(DestReg, RC, UseAligned, Subtarget);
	  DebugLoc NoLoc;
	  MachineInstrBuilder Builder = BuildMI(MF, NoLoc, get(LoadOpc), DestReg);

	  // Append the address operands after the destination register.
	  for (const MachineOperand &AddrOp : Addr)
	    Builder.addOperand(AddrOp);

	  Builder->setMemRefs(MMOBegin, MMOEnd);
	  NewMIs.push_back(Builder);
	}

	bool X86InstrInfo::analyzeCompare(const MachineInstr &MI, unsigned &SrcReg,
	unsigned &SrcReg2, int &CmpMask,
	int &CmpValue) const {
	switch (MI.getOpcode()) {
	default: break;
	case X86::CMP64ri32:
	case X86::CMP64ri8:
	case X86::CMP32ri:
	case X86::CMP32ri8:
	case X86::CMP16ri:
	case X86::CMP16ri8:
	case X86::CMP8ri:
	if (!MI.getOperand(1).isImm())
	return false;
	SrcReg = MI.getOperand(0).getReg();
	SrcReg2 = 0;
	CmpMask = ~0;
	CmpValue = MI.getOperand(1).getImm();
	return true;
	// A SUB can be used to perform comparison.
	case X86::SUB64rm:
	case X86::SUB32rm:
	case X86::SUB16rm:
	case X86::SUB8rm:
	SrcReg = MI.getOperand(1).getReg();
	SrcReg2 = 0;
	CmpMask = ~0;
	CmpValue = 0;
	return true;
	case X86::SUB64rr:
	case X86::SUB32rr:
	case X86::SUB16rr:
	case X86::SUB8rr:
	SrcReg = MI.getOperand(1).getReg();
	SrcReg2 = MI.getOperand(2).getReg();
	CmpMask = ~0;
	CmpValue = 0;
	return true;
	case X86::SUB64ri32:
	case X86::SUB64ri8:
	case X86::SUB32ri:
	case X86::SUB32ri8:
	case X86::SUB16ri:
	case X86::SUB16ri8:
	case X86::SUB8ri:
	if (!MI.getOperand(2).isImm())
	return false;
	SrcReg = MI.getOperand(1).getReg();
	SrcReg2 = 0;
	CmpMask = ~0;
	CmpValue = MI.getOperand(2).getImm();
	return true;
	case X86::CMP64rr:
	case X86::CMP32rr:
	case X86::CMP16rr:
	case X86::CMP8rr:
	SrcReg = MI.getOperand(0).getReg();
	SrcReg2 = MI.getOperand(1).getReg();
	CmpMask = ~0;
	CmpValue = 0;
	return true;
	case X86::TEST8rr:
	case X86::TEST16rr:
	case X86::TEST32rr:
	case X86::TEST64rr:
	SrcReg = MI.getOperand(0).getReg();
	if (MI.getOperand(1).getReg() != SrcReg)
	return false;
	// Compare against zero.
	SrcReg2 = 0;
	CmpMask = ~0;
	CmpValue = 0;
	return true;
	}
	return false;
	}

	/// Check whether the first instruction, whose only
	/// purpose is to update flags, can be made redundant.
	/// CMPrr can be made redundant by SUBrr if the operands are the same
	/// (in either order); CMPri can be made redundant by the SUBri of the same
	/// encoding if the register and the immediate are the same.
	/// This function can be extended later on.
	/// \param FlagI    the flag-setting compare instruction.
	/// \param SrcReg   first register operand of FlagI.
	/// \param SrcReg2  second register operand of FlagI (register-register
	///                 forms only).
	/// \param ImmValue immediate for FlagI if it takes an immediate.
	/// \param OI       the other instruction that may make FlagI redundant.
	/// \returns true if OI is the SUB counterpart of FlagI on matching operands.
	inline static bool isRedundantFlagInstr(MachineInstr &FlagI, unsigned SrcReg,
	                                        unsigned SrcReg2, int ImmValue,
	                                        MachineInstr &OI) {
	  const unsigned FlagOpc = FlagI.getOpcode();
	  const unsigned OtherOpc = OI.getOpcode();

	  // Register-register forms: the operands may appear in either order. The
	  // operand accessors are only reached once the opcode pair has matched.
	  if (((FlagOpc == X86::CMP64rr && OtherOpc == X86::SUB64rr) ||
	       (FlagOpc == X86::CMP32rr && OtherOpc == X86::SUB32rr) ||
	       (FlagOpc == X86::CMP16rr && OtherOpc == X86::SUB16rr) ||
	       (FlagOpc == X86::CMP8rr && OtherOpc == X86::SUB8rr)) &&
	      ((OI.getOperand(1).getReg() == SrcReg &&
	        OI.getOperand(2).getReg() == SrcReg2) ||
	       (OI.getOperand(1).getReg() == SrcReg2 &&
	        OI.getOperand(2).getReg() == SrcReg)))
	    return true;

	  // Register-immediate forms: same encoding, same register, same immediate.
	  if (((FlagOpc == X86::CMP64ri32 && OtherOpc == X86::SUB64ri32) ||
	       (FlagOpc == X86::CMP64ri8 && OtherOpc == X86::SUB64ri8) ||
	       (FlagOpc == X86::CMP32ri && OtherOpc == X86::SUB32ri) ||
	       (FlagOpc == X86::CMP32ri8 && OtherOpc == X86::SUB32ri8) ||
	       (FlagOpc == X86::CMP16ri && OtherOpc == X86::SUB16ri) ||
	       (FlagOpc == X86::CMP16ri8 && OtherOpc == X86::SUB16ri8) ||
	       (FlagOpc == X86::CMP8ri && OtherOpc == X86::SUB8ri)) &&
	      OI.getOperand(1).getReg() == SrcReg &&
	      OI.getOperand(2).getImm() == ImmValue)
	    return true;
	  return false;
	}

	/// Decide whether the defining instruction \p MI can be converted so that a
	/// later comparison against zero can be removed.
	///
	/// Immediate shifts qualify only when their truncated shift count is
	/// non-zero; immediate left shifts are additionally rejected when the count
	/// satisfies isTruncatedShiftCountForLEA. The listed arithmetic, logic,
	/// shift-by-one and bit-manipulation opcodes always qualify; every other
	/// opcode does not.
	inline static bool isDefConvertible(MachineInstr &MI) {
	  switch (MI.getOpcode()) {
	  // The shift instructions only modify ZF if their shift count is non-zero.
	  // N.B.: The processor truncates the shift count depending on the encoding.
	  case X86::SAR8ri:
	  case X86::SAR16ri:
	  case X86::SAR32ri:
	  case X86::SAR64ri:
	  case X86::SHR8ri:
	  case X86::SHR16ri:
	  case X86::SHR32ri:
	  case X86::SHR64ri:
	    return getTruncatedShiftCount(MI, 2) != 0;

	  // Some left shift instructions can be turned into LEA instructions but only
	  // if their flags aren't used. Avoid transforming such instructions.
	  case X86::SHL8ri:
	  case X86::SHL16ri:
	  case X86::SHL32ri:
	  case X86::SHL64ri: {
	    const unsigned Count = getTruncatedShiftCount(MI, 2);
	    return !isTruncatedShiftCountForLEA(Count) && Count != 0;
	  }

	  // Double shifts carry their count in operand 3.
	  case X86::SHRD16rri8:
	  case X86::SHRD32rri8:
	  case X86::SHRD64rri8:
	  case X86::SHLD16rri8:
	  case X86::SHLD32rri8:
	  case X86::SHLD64rri8:
	    return getTruncatedShiftCount(MI, 3) != 0;

	  // SUB
	  case X86::SUB8ri:  case X86::SUB16ri:   case X86::SUB16ri8:
	  case X86::SUB32ri: case X86::SUB32ri8:
	  case X86::SUB64ri8: case X86::SUB64ri32:
	  case X86::SUB8rr:  case X86::SUB16rr: case X86::SUB32rr: case X86::SUB64rr:
	  case X86::SUB8rm:  case X86::SUB16rm: case X86::SUB32rm: case X86::SUB64rm:
	  // DEC
	  case X86::DEC8r: case X86::DEC16r: case X86::DEC32r: case X86::DEC64r:
	  // ADD
	  case X86::ADD8ri:  case X86::ADD16ri:   case X86::ADD16ri8:
	  case X86::ADD32ri: case X86::ADD32ri8:
	  case X86::ADD64ri8: case X86::ADD64ri32:
	  case X86::ADD8rr:  case X86::ADD16rr: case X86::ADD32rr: case X86::ADD64rr:
	  case X86::ADD8rm:  case X86::ADD16rm: case X86::ADD32rm: case X86::ADD64rm:
	  // INC
	  case X86::INC8r: case X86::INC16r: case X86::INC32r: case X86::INC64r:
	  // AND
	  case X86::AND8ri:  case X86::AND16ri:   case X86::AND16ri8:
	  case X86::AND32ri: case X86::AND32ri8:
	  case X86::AND64ri8: case X86::AND64ri32:
	  case X86::AND8rr:  case X86::AND16rr: case X86::AND32rr: case X86::AND64rr:
	  case X86::AND8rm:  case X86::AND16rm: case X86::AND32rm: case X86::AND64rm:
	  // XOR
	  case X86::XOR8ri:  case X86::XOR16ri:   case X86::XOR16ri8:
	  case X86::XOR32ri: case X86::XOR32ri8:
	  case X86::XOR64ri8: case X86::XOR64ri32:
	  case X86::XOR8rr:  case X86::XOR16rr: case X86::XOR32rr: case X86::XOR64rr:
	  case X86::XOR8rm:  case X86::XOR16rm: case X86::XOR32rm: case X86::XOR64rm:
	  // OR
	  case X86::OR8ri:  case X86::OR16ri:   case X86::OR16ri8:
	  case X86::OR32ri: case X86::OR32ri8:
	  case X86::OR64ri8: case X86::OR64ri32:
	  case X86::OR8rr:  case X86::OR16rr: case X86::OR32rr: case X86::OR64rr:
	  case X86::OR8rm:  case X86::OR16rm: case X86::OR32rm: case X86::OR64rm:
	  // NEG
	  case X86::NEG8r: case X86::NEG16r: case X86::NEG32r: case X86::NEG64r:
	  // Shifts by one
	  case X86::SAR8r1: case X86::SAR16r1: case X86::SAR32r1: case X86::SAR64r1:
	  case X86::SHR8r1: case X86::SHR16r1: case X86::SHR32r1: case X86::SHR64r1:
	  case X86::SHL8r1: case X86::SHL16r1: case X86::SHL32r1: case X86::SHL64r1:
	  // ADC / SBB (32-bit and 64-bit register forms only)
	  case X86::ADC32ri: case X86::ADC32ri8: case X86::ADC32rr:
	  case X86::ADC64ri8: case X86::ADC64ri32: case X86::ADC64rr:
	  case X86::SBB32ri: case X86::SBB32ri8: case X86::SBB32rr:
	  case X86::SBB64ri8: case X86::SBB64ri32: case X86::SBB64rr:
	  // ANDN / BEXTR / BLSI / BLSMSK / BLSR / BZHI
	  case X86::ANDN32rr:   case X86::ANDN32rm:
	  case X86::ANDN64rr:   case X86::ANDN64rm:
	  case X86::BEXTR32rr:  case X86::BEXTR32rm:
	  case X86::BEXTR64rr:  case X86::BEXTR64rm:
	  case X86::BLSI32rr:   case X86::BLSI32rm:
	  case X86::BLSI64rr:   case X86::BLSI64rm:
	  case X86::BLSMSK32rr: case X86::BLSMSK32rm:
	  case X86::BLSMSK64rr: case X86::BLSMSK64rm:
	  case X86::BLSR32rr:   case X86::BLSR32rm:
	  case X86::BLSR64rr:   case X86::BLSR64rm:
	  case X86::BZHI32rr:   case X86::BZHI32rm:
	  case X86::BZHI64rr:   case X86::BZHI64rm:
	  // LZCNT / POPCNT / TZCNT
	  case X86::LZCNT16rr:  case X86::LZCNT16rm:
	  case X86::LZCNT32rr:  case X86::LZCNT32rm:
	  case X86::LZCNT64rr:  case X86::LZCNT64rm:
	  case X86::POPCNT16rr: case X86::POPCNT16rm:
	  case X86::POPCNT32rr: case X86::POPCNT32rm:
	  case X86::POPCNT64rr: case X86::POPCNT64rm:
	  case X86::TZCNT16rr:  case X86::TZCNT16rm:
	  case X86::TZCNT32rr:  case X86::TZCNT32rm:
	  case X86::TZCNT64rr:  case X86::TZCNT64rm:
	    return true;

	  default:
	    return false;
	  }
	}

	/// Check whether the use \p MI can be converted to remove a comparison
	/// against zero.
	///
	/// \returns X86::COND_B for the LZCNT and TZCNT opcodes, X86::COND_E for the
	/// POPCNT opcodes, and X86::COND_INVALID for every other instruction.
	static X86::CondCode isUseDefConvertible(MachineInstr &MI) {
	  switch (MI.getOpcode()) {
	  // LZCNT and TZCNT map to the same condition code.
	  case X86::LZCNT16rr: case X86::LZCNT16rm:
	  case X86::LZCNT32rr: case X86::LZCNT32rm:
	  case X86::LZCNT64rr: case X86::LZCNT64rm:
	  case X86::TZCNT16rr: case X86::TZCNT16rm:
	  case X86::TZCNT32rr: case X86::TZCNT32rm:
	  case X86::TZCNT64rr: case X86::TZCNT64rm:
	    return X86::COND_B;

	  case X86::POPCNT16rr: case X86::POPCNT16rm:
	  case X86::POPCNT32rr: case X86::POPCNT32rm:
	  case X86::POPCNT64rr: case X86::POPCNT64rm:
	    return X86::COND_E;

	  default:
	    return X86::COND_INVALID;
	  }
	}

	/// Check if there exists an earlier instruction that
	/// operates on the same source operands and sets flags in the same way as
	/// Compare; remove Compare if possible.
	bool X86InstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, unsigned SrcReg,
	unsigned SrcReg2, int CmpMask,
	int CmpValue,
	const MachineRegisterInfo *MRI) const {
	// Check whether we can replace SUB with CMP.
	unsigned NewOpcode = 0;
	switch (CmpInstr.getOpcode()) {
	default: break;
	case X86::SUB64ri32:
	case X86::SUB64ri8:
	case X86::SUB32ri:
	case X86::SUB32ri8:
	case X86::SUB16ri:
	case X86::SUB16ri8:
	case X86::SUB8ri:
	case X86::SUB64rm:
	case X86::SUB32rm:
	case X86::SUB16rm:
	case X86::SUB8rm:
	case X86::SUB64rr:
	case X86::SUB32rr:
	case X86::SUB16rr:
	case X86::SUB8rr: {
	if (!MRI->use_nodbg_empty(CmpInstr.getOperand(0).getReg()))
	return false;
	// There is no use of the destination register, we can replace SUB with CMP.
	switch (CmpInstr.getOpcode()) {
	default: llvm_unreachable("Unreachable!");
	case X86::SUB64rm: NewOpcode = X86::CMP64rm; break;
	case X86::SUB32rm: NewOpcode = X86::CMP32rm; break;
	case X86::SUB16rm: NewOpcode = X86::CMP16rm; break;
	case X86::SUB8rm: NewOpcode = X86::CMP8rm; break;
	case X86::SUB64rr: NewOpcode = X86::CMP64rr; break;
	case X86::SUB32rr: NewOpcode = X86::CMP32rr; break;
	case X86::SUB16rr: NewOpcode = X86::CMP16rr; break;
	case X86::SUB8rr: NewOpcode = X86::CMP8rr; break;
	case X86::SUB64ri32: NewOpcode = X86::CMP64ri32; break;
	case X86::SUB64ri8: NewOpcode = X86::CMP64ri8; break;
	case X86::SUB32ri: NewOpcode = X86::CMP32ri; break;
	case X86::SUB32ri8: NewOpcode = X86::CMP32ri8; break;
	case X86::SUB16ri: NewOpcode = X86::CMP16ri; break;
	case X86::SUB16ri8: NewOpcode = X86::CMP16ri8; break;
	case X86::SUB8ri: NewOpcode = X86::CMP8ri; break;
	}
	CmpInstr.setDesc(get(NewOpcode));
	CmpInstr.RemoveOperand(0);
	// Fall through to optimize Cmp if Cmp is CMPrr or CMPri.
	if (NewOpcode == X86::CMP64rm \|\| NewOpcode == X86::CMP32rm \|\|
	NewOpcode == X86::CMP16rm \|\| NewOpcode == X86::CMP8rm)
	return false;
	}
	}

	// Get the unique definition of SrcReg.
	MachineInstr *MI = MRI->getUniqueVRegDef(SrcReg);
	if (!MI) return false;

	// CmpInstr is the first instruction of the BB.
	MachineBasicBlock::iterator I = CmpInstr, Def = MI;

	// If we are comparing against zero, check whether we can use MI to update
	// EFLAGS. If MI is not in the same BB as CmpInstr, do not optimize.
	bool IsCmpZero = (SrcReg2 == 0 && CmpValue == 0);
	if (IsCmpZero && MI->getParent() != CmpInstr.getParent())
	return false;

	// If we have a use of the source register between the def and our compare
	// instruction we can eliminate the compare iff the use sets EFLAGS in the
	// right way.
	bool ShouldUpdateCC = false;
	X86::CondCode NewCC = X86::COND_INVALID;
	if (IsCmpZero && !isDefConvertible(*MI)) {
	// Scan forward from the use until we hit the use we're looking for or the
	// compare instruction.
	for (MachineBasicBlock::iterator J = MI;; ++J) {
	// Do we have a convertible instruction?
	NewCC = isUseDefConvertible(*J);
	if (NewCC != X86::COND_INVALID && J->getOperand(1).isReg() &&
	J->getOperand(1).getReg() == SrcReg) {
	assert(J->definesRegister(X86::EFLAGS) && "Must be an EFLAGS def!");
	ShouldUpdateCC = true; // Update CC later on.
	// This is not a def of SrcReg, but still a def of EFLAGS. Keep going
	// with the new def.
	Def = J;
	MI = &*Def;
	break;
	}

	if (J == I)
	return false;
	}
	}

	// We are searching for an earlier instruction that can make CmpInstr
	// redundant and that instruction will be saved in Sub.
	MachineInstr *Sub = nullptr;
	const TargetRegisterInfo *TRI = &getRegisterInfo();

	// We iterate backward, starting from the instruction before CmpInstr and
	// stop when reaching the definition of a source register or done with the BB.
	// RI points to the instruction before CmpInstr.
	// If the definition is in this basic block, RE points to the definition;
	// otherwise, RE is the rend of the basic block.
	MachineBasicBlock::reverse_iterator
	RI = ++I.getReverse(),
	RE = CmpInstr.getParent() == MI->getParent()
	? Def.getReverse() /* points to MI */
	: CmpInstr.getParent()->rend();
	MachineInstr *Movr0Inst = nullptr;
	for (; RI != RE; ++RI) {
	MachineInstr &Instr = *RI;
	// Check whether CmpInstr can be made redundant by the current instruction.
	if (!IsCmpZero &&
	isRedundantFlagInstr(CmpInstr, SrcReg, SrcReg2, CmpValue, Instr)) {
	Sub = &Instr;
	break;
	}

	if (Instr.modifiesRegister(X86::EFLAGS, TRI) \|\|
	Instr.readsRegister(X86::EFLAGS, TRI)) {
	// This instruction modifies or uses EFLAGS.

	// MOV32r0 etc. are implemented with xor which clobbers condition code.
	// They are safe to move up, if the definition to EFLAGS is dead and
	// earlier instructions do not read or write EFLAGS.
	if (!Movr0Inst && Instr.getOpcode() == X86::MOV32r0 &&
	Instr.registerDefIsDead(X86::EFLAGS, TRI)) {
	Movr0Inst = &Instr;
	continue;
	}

	// We can't remove CmpInstr.
	return false;
	}
	}

	// Return false if no candidates exist.
	if (!IsCmpZero && !Sub)
	return false;

	bool IsSwapped = (SrcReg2 != 0 && Sub->getOperand(1).getReg() == SrcReg2 &&
	Sub->getOperand(2).getReg() == SrcReg);

	// Scan forward from the instruction after CmpInstr for uses of EFLAGS.
	// It is safe to remove CmpInstr if EFLAGS is redefined or killed.
	// If we are done with the basic block, we need to check whether EFLAGS is
	// live-out.
	bool IsSafe = false;
	SmallVector<std::pair<MachineInstr, unsigned /NewOpc*/>, 4> OpsToUpdate;
	MachineBasicBlock::iterator E = CmpInstr.getParent()->end();
	for (++I; I != E; ++I) {
	const MachineInstr &Instr = *I;
	bool ModifyEFLAGS = Instr.modifiesRegister(X86::EFLAGS, TRI);
	bool UseEFLAGS = Instr.readsRegister(X86::EFLAGS, TRI);
	// We should check the usage if this instruction uses and updates EFLAGS.
	if (!UseEFLAGS && ModifyEFLAGS) {
	// It is safe to remove CmpInstr if EFLAGS is updated again.
	IsSafe = true;
	break;
	}
	if (!UseEFLAGS && !ModifyEFLAGS)
	continue;

	// EFLAGS is used by this instruction.
	X86::CondCode OldCC = X86::COND_INVALID;
	bool OpcIsSET = false;
	if (IsCmpZero \|\| IsSwapped) {
	// We decode the condition code from opcode.
	if (Instr.isBranch())
	OldCC = getCondFromBranchOpc(Instr.getOpcode());
	else {
	OldCC = getCondFromSETOpc(Instr.getOpcode());
	if (OldCC != X86::COND_INVALID)
	OpcIsSET = true;
	else
	OldCC = X86::getCondFromCMovOpc(Instr.getOpcode());
	}
	if (OldCC == X86::COND_INVALID) return false;
	}
	if (IsCmpZero) {
	switch (OldCC) {
	default: break;
	case X86::COND_A: case X86::COND_AE:
	case X86::COND_B: case X86::COND_BE:
	case X86::COND_G: case X86::COND_GE:
	case X86::COND_L: case X86::COND_LE:
	case X86::COND_O: case X86::COND_NO:
	// CF and OF are used, we can't perform this optimization.
	return false;
	}

	// If we're updating the condition code check if we have to reverse the
	// condition.
	if (ShouldUpdateCC)
	switch (OldCC) {
	default:
	return false;
	case X86::COND_E:
	break;
	case X86::COND_NE:
	NewCC = GetOppositeBranchCondition(NewCC);
	break;
	}
	} else if (IsSwapped) {
	// If we have SUB(r1, r2) and CMP(r2, r1), the condition code needs
	// to be changed from r2 > r1 to r1 < r2, from r2 < r1 to r1 > r2, etc.
	// We swap the condition code and synthesize the new opcode.
	NewCC = getSwappedCondition(OldCC);
	if (NewCC == X86::COND_INVALID) return false;
	}

	if ((ShouldUpdateCC \|\| IsSwapped) && NewCC != OldCC) {
	// Synthesize the new opcode.
	bool HasMemoryOperand = Instr.hasOneMemOperand();
	unsigned NewOpc;
	if (Instr.isBranch())
	NewOpc = GetCondBranchFromCond(NewCC);
	else if(OpcIsSET)
	NewOpc = getSETFromCond(NewCC, HasMemoryOperand);
	else {
	unsigned DstReg = Instr.getOperand(0).getReg();
	NewOpc = getCMovFromCond(NewCC, MRI->getRegClass(DstReg)->getSize(),
	HasMemoryOperand);
	}

	// Push the MachineInstr to OpsToUpdate.
	// If it is safe to remove CmpInstr, the condition code of these
	// instructions will be modified.
	OpsToUpdate.push_back(std::make_pair(&*I, NewOpc));
	}
	if (ModifyEFLAGS \|\| Instr.killsRegister(X86::EFLAGS, TRI)) {
	// It is safe to remove CmpInstr if EFLAGS is updated again or killed.
	IsSafe = true;
	break;
	}
	}

	// If EFLAGS is not killed nor re-defined, we should check whether it is
	// live-out. If it is live-out, do not optimize.
	if ((IsCmpZero \|\| IsSwapped) && !IsSafe) {
	MachineBasicBlock *MBB = CmpInstr.getParent();
	for (MachineBasicBlock *Successor : MBB->successors())
	if (Successor->isLiveIn(X86::EFLAGS))
	return false;
	}

	// The instruction to be updated is either Sub or MI.
	Sub = IsCmpZero ? MI : Sub;
	// Move Movr0Inst to the appropriate place before Sub.
	if (Movr0Inst) {
	// Look backwards until we find a def that doesn't use the current EFLAGS.
	Def = Sub;
	MachineBasicBlock::reverse_iterator InsertI = Def.getReverse(),
	InsertE = Sub->getParent()->rend();
	for (; InsertI != InsertE; ++InsertI) {
	MachineInstr Instr = &InsertI;
	if (!Instr->readsRegister(X86::EFLAGS, TRI) &&
	Instr->modifiesRegister(X86::EFLAGS, TRI)) {
	Sub->getParent()->remove(Movr0Inst);
	Instr->getParent()->insert(MachineBasicBlock::iterator(Instr),
	Movr0Inst);
	break;
	}
	}
	if (InsertI == InsertE)
	return false;
	}

	// Make sure Sub instruction defines EFLAGS and mark the def live.
	unsigned i = 0, e = Sub->getNumOperands();
	for (; i != e; ++i) {
	MachineOperand &MO = Sub->getOperand(i);
	if (MO.isReg() && MO.isDef() && MO.getReg() == X86::EFLAGS) {
	MO.setIsDead(false);
	break;
	}
	}
	assert(i != e && "Unable to locate a def EFLAGS operand");

	CmpInstr.eraseFromParent();

	// Modify the condition code of instructions in OpsToUpdate.
	for (auto &Op : OpsToUpdate)
	Op.first->setDesc(get(Op.second));
	return true;
	}

	/// Try to remove the load by folding it to a register operand at the use.
	/// We fold the load instructions if load defines a virtual register, the
	/// virtual register is used once in the same BB, and the instructions
	/// in-between do not load or store, and have no side effects.
	///
	/// \param MI               The instruction using the loaded register.
	/// \param MRI              Register info used to look up the defining load.
	/// \param FoldAsLoadDefReg Register defined by the load; reset to 0 when the
	///                         fold succeeds.
	/// \param DefMI            Set to the instruction defining
	///                         \p FoldAsLoadDefReg.
	/// \returns the instruction produced by foldMemoryOperand, or nullptr if the
	/// load could not be folded.
	MachineInstr *X86InstrInfo::optimizeLoadInstr(MachineInstr &MI,
	                                              const MachineRegisterInfo *MRI,
	                                              unsigned &FoldAsLoadDefReg,
	                                              MachineInstr *&DefMI) const {
	  // Check whether we can move DefMI here.
	  DefMI = MRI->getVRegDef(FoldAsLoadDefReg);
	  assert(DefMI);
	  bool SawStore = false;
	  if (!DefMI->isSafeToMove(nullptr, SawStore))
	    return nullptr;

	  // Collect information about virtual register operands of MI.
	  SmallVector<unsigned, 1> SrcOperandIds;
	  for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
	    MachineOperand &MO = MI.getOperand(i);
	    if (!MO.isReg())
	      continue;
	    unsigned Reg = MO.getReg();
	    if (Reg != FoldAsLoadDefReg)
	      continue;
	    // Do not fold if we have a subreg use or a def.
	    if (MO.getSubReg() || MO.isDef())
	      return nullptr;
	    SrcOperandIds.push_back(i);
	  }
	  if (SrcOperandIds.empty())
	    return nullptr;

	  // Check whether we can fold the def into SrcOperandId.
	  if (MachineInstr *FoldMI = foldMemoryOperand(MI, SrcOperandIds, *DefMI)) {
	    FoldAsLoadDefReg = 0;
	    return FoldMI;
	  }

	  return nullptr;
	}

	/// Rewrite a single-def pseudo instruction as the two-address instruction
	/// described by \p Desc, giving it two undef reads of the register that the
	/// pseudo defines. For example:
	///   %xmm4 = V_SET0
	/// becomes:
	///   %xmm4 = PXORrr %xmm4<undef>, %xmm4<undef>
	///
	/// Always returns true.
	static bool Expand2AddrUndef(MachineInstrBuilder &MIB,
	                             const MCInstrDesc &Desc) {
	  assert(Desc.getNumOperands() == 3 && "Expected two-addr instruction.");
	  const unsigned DefReg = MIB->getOperand(0).getReg();
	  MIB->setDesc(Desc);

	  // MachineInstr::addOperand() is expected to place explicit operands ahead
	  // of any implicit ones.
	  MIB.addReg(DefReg, RegState::Undef);
	  MIB.addReg(DefReg, RegState::Undef);
	  // Verify the placement instead of relying on it.
	  assert(MIB->getOperand(1).getReg() == DefReg &&
	         MIB->getOperand(2).getReg() == DefReg && "Misplaced operand");
	  return true;
	}

	/// Rewrite a single-def pseudo instruction as the two-address instruction
	/// described by \p Desc, appending two undef reads of \p Reg as its sources.
	/// For example, with \p Reg being %k0:
	///   %k4 = K_SET1
	/// becomes:
	///   %k4 = KXNORrr %k0, %k0
	///
	/// Always returns true.
	static bool Expand2AddrKreg(MachineInstrBuilder &MIB,
	                            const MCInstrDesc &Desc, unsigned Reg) {
	  assert(Desc.getNumOperands() == 3 && "Expected two-addr instruction.");
	  MIB->setDesc(Desc);
	  // Both source operands read the same register, marked undef.
	  MIB.addReg(Reg, RegState::Undef);
	  MIB.addReg(Reg, RegState::Undef);
	  return true;
	}

	/// Expand a pseudo that materializes 1 or -1 into a register: an XOR32rr
	/// that zeroes the register is inserted before the pseudo, and the pseudo
	/// itself is turned into INC32r (or DEC32r when \p MinusOne is set).
	/// Always returns true.
	static bool expandMOV32r1(MachineInstrBuilder &MIB, const TargetInstrInfo &TII,
	                          bool MinusOne) {
	  MachineBasicBlock &ParentMBB = *MIB->getParent();
	  DebugLoc Loc = MIB->getDebugLoc();
	  unsigned DstReg = MIB->getOperand(0).getReg();

	  // Zero the register first; both XOR inputs are undef reads of it.
	  BuildMI(ParentMBB, MIB.getInstr(), Loc, TII.get(X86::XOR32rr), DstReg)
	      .addReg(DstReg, RegState::Undef)
	      .addReg(DstReg, RegState::Undef);

	  // The pseudo becomes the increment/decrement of the zeroed register.
	  const unsigned StepOpc = MinusOne ? X86::DEC32r : X86::INC32r;
	  MIB->setDesc(TII.get(StepOpc));
	  MIB.addReg(DstReg);

	  return true;
	}

	/// Expand MOV32ImmSExti8 / MOV64ImmSExti8 into a push of the immediate
	/// followed by a pop into the destination register.
	///
	/// In 64-bit mode, if the function uses the red zone, the pseudo is instead
	/// turned into a plain MOV32ri / MOV64ri. When DWARF CFI is needed and the
	/// function has no frame pointer, CFA-offset adjustments are emitted around
	/// the pop. Always returns true.
	static bool ExpandMOVImmSExti8(MachineInstrBuilder &MIB,
	                               const TargetInstrInfo &TII,
	                               const X86Subtarget &Subtarget) {
	  MachineBasicBlock &MBB = *MIB->getParent();
	  DebugLoc DL = MIB->getDebugLoc();
	  int64_t Imm = MIB->getOperand(1).getImm();
	  assert(Imm != 0 && "Using push/pop for 0 is not efficient.");
	  MachineBasicBlock::iterator I = MIB.getInstr();

	  int StackAdjustment;

	  if (Subtarget.is64Bit()) {
	    assert(MIB->getOpcode() == X86::MOV64ImmSExti8 ||
	           MIB->getOpcode() == X86::MOV32ImmSExti8);

	    // Can't use push/pop lowering if the function might write to the red zone.
	    X86MachineFunctionInfo *X86FI =
	        MBB.getParent()->getInfo<X86MachineFunctionInfo>();
	    if (X86FI->getUsesRedZone()) {
	      MIB->setDesc(TII.get(MIB->getOpcode() ==
	                           X86::MOV32ImmSExti8 ? X86::MOV32ri : X86::MOV64ri));
	      return true;
	    }

	    // 64-bit mode doesn't have 32-bit push/pop, so use 64-bit operations and
	    // widen the register if necessary.
	    StackAdjustment = 8;
	    BuildMI(MBB, I, DL, TII.get(X86::PUSH64i8)).addImm(Imm);
	    MIB->setDesc(TII.get(X86::POP64r));
	    MIB->getOperand(0)
	        .setReg(getX86SubSuperRegister(MIB->getOperand(0).getReg(), 64));
	  } else {
	    assert(MIB->getOpcode() == X86::MOV32ImmSExti8);
	    StackAdjustment = 4;
	    BuildMI(MBB, I, DL, TII.get(X86::PUSH32i8)).addImm(Imm);
	    MIB->setDesc(TII.get(X86::POP32r));
	  }

	  // Build CFI if necessary.
	  MachineFunction &MF = *MBB.getParent();
	  const X86FrameLowering *TFL = Subtarget.getFrameLowering();
	  bool IsWin64Prologue = MF.getTarget().getMCAsmInfo()->usesWindowsCFI();
	  bool NeedsDwarfCFI =
	      !IsWin64Prologue &&
	      (MF.getMMI().hasDebugInfo() || MF.getFunction()->needsUnwindTableEntry());
	  bool EmitCFI = !TFL->hasFP(MF) && NeedsDwarfCFI;
	  if (EmitCFI) {
	    // One adjustment is inserted at the pop (i.e. after the push that was
	    // built before it) and the inverse adjustment right after the pop.
	    TFL->BuildCFI(MBB, I, DL,
	        MCCFIInstruction::createAdjustCfaOffset(nullptr, StackAdjustment));
	    TFL->BuildCFI(MBB, std::next(I), DL,
	        MCCFIInstruction::createAdjustCfaOffset(nullptr, -StackAdjustment));
	  }

	  return true;
	}

	// LoadStackGuard has so far only been implemented for 64-bit MachO. Different
	// code sequence is needed for other targets.
	//
	// Expands the pseudo into two MOV64rm instructions: the first, inserted
	// before the pseudo, loads into Reg through a RIP-relative GOTPCREL
	// reference to the guard global; the pseudo itself is then rewritten into a
	// second MOV64rm that loads through Reg (killing it) with zero displacement.
	static void expandLoadStackGuard(MachineInstrBuilder &MIB,
	                                 const TargetInstrInfo &TII) {
	  MachineBasicBlock &MBB = *MIB->getParent();
	  DebugLoc DL = MIB->getDebugLoc();
	  unsigned Reg = MIB->getOperand(0).getReg();
	  // The guard global is taken from the pseudo's first memory operand.
	  const GlobalValue *GV =
	      cast<GlobalValue>((*MIB->memoperands_begin())->getValue());
	  auto Flags = MachineMemOperand::MOLoad |
	               MachineMemOperand::MODereferenceable |
	               MachineMemOperand::MOInvariant;
	  MachineMemOperand *MMO = MBB.getParent()->getMachineMemOperand(
	      MachinePointerInfo::getGOT(*MBB.getParent()), Flags, 8, 8);
	  MachineBasicBlock::iterator I = MIB.getInstr();

	  BuildMI(MBB, I, DL, TII.get(X86::MOV64rm), Reg).addReg(X86::RIP).addImm(1)
	      .addReg(0).addGlobalAddress(GV, 0, X86II::MO_GOTPCREL).addReg(0)
	      .addMemOperand(MMO);
	  MIB->setDebugLoc(DL);
	  MIB->setDesc(TII.get(X86::MOV64rm));
	  MIB.addReg(Reg, RegState::Kill).addImm(1).addReg(0).addImm(0).addReg(0);
	}

	// Handles reloads of 128/256-bit registers when AVX512 is available but VLX
	// is not. If the destination is an extended register, the load has to use an
	// instruction that loads the lower 128/256 bits yet is available with only
	// AVX512F. Always returns true.
	static bool expandNOVLXLoad(MachineInstrBuilder &MIB,
	                            const TargetRegisterInfo *TRI,
	                            const MCInstrDesc &LoadDesc,
	                            const MCInstrDesc &BroadcastDesc,
	                            unsigned SubIdx) {
	  unsigned DestReg = MIB->getOperand(0).getReg();

	  // Encodings below 16 are not XMM16-31 / YMM16-31, so a normal VEX encoded
	  // load works.
	  if (TRI->getEncodingValue(DestReg) < 16) {
	    MIB->setDesc(LoadDesc);
	    return true;
	  }

	  // Extended register: use a 128/256-bit VBROADCAST instruction and widen
	  // the destination to the matching 512-bit register.
	  MIB->setDesc(BroadcastDesc);
	  DestReg = TRI->getMatchingSuperReg(DestReg, SubIdx, &X86::VR512RegClass);
	  MIB->getOperand(0).setReg(DestReg);
	  return true;
	}

	// Handles spills of 128/256-bit registers when AVX512 is available but VLX
	// is not. If the source is an extended register, the store has to use an
	// instruction that stores the lower 128/256 bits yet is available with only
	// AVX512F. Always returns true.
	static bool expandNOVLXStore(MachineInstrBuilder &MIB,
	                             const TargetRegisterInfo *TRI,
	                             const MCInstrDesc &StoreDesc,
	                             const MCInstrDesc &ExtractDesc,
	                             unsigned SubIdx) {
	  // The stored register follows the address operands.
	  unsigned SrcReg = MIB->getOperand(X86::AddrNumOperands).getReg();

	  // Encodings below 16 are not XMM16-31 / YMM16-31, so a normal VEX encoded
	  // store works.
	  if (TRI->getEncodingValue(SrcReg) < 16) {
	    MIB->setDesc(StoreDesc);
	    return true;
	  }

	  // Extended register: use a VEXTRACTF instruction and widen the source to
	  // the matching 512-bit register.
	  MIB->setDesc(ExtractDesc);
	  SrcReg = TRI->getMatchingSuperReg(SrcReg, SubIdx, &X86::VR512RegClass);
	  MIB->getOperand(X86::AddrNumOperands).setReg(SrcReg);
	  MIB.addImm(0x0); // Append immediate to extract from the lower bits.
	  return true;
	}
	/// Expand the X86 pseudo instruction \p MI in place into real instruction(s).
	///
	/// Dispatches on the opcode to the static expansion helpers or rewrites the
	/// descriptor directly.
	/// \returns true if \p MI was one of the handled pseudos and has been
	/// expanded, false otherwise.
	bool X86InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
	  bool HasAVX = Subtarget.hasAVX();
	  MachineInstrBuilder MIB(*MI.getParent()->getParent(), MI);
	  switch (MI.getOpcode()) {
	  case X86::MOV32r0:
	    return Expand2AddrUndef(MIB, get(X86::XOR32rr));
	  case X86::MOV32r1:
	    return expandMOV32r1(MIB, *this, /*MinusOne=*/ false);
	  case X86::MOV32r_1:
	    return expandMOV32r1(MIB, *this, /*MinusOne=*/ true);
	  case X86::MOV32ImmSExti8:
	  case X86::MOV64ImmSExti8:
	    return ExpandMOVImmSExti8(MIB, *this, Subtarget);
	  case X86::SETB_C8r:
	    return Expand2AddrUndef(MIB, get(X86::SBB8rr));
	  case X86::SETB_C16r:
	    return Expand2AddrUndef(MIB, get(X86::SBB16rr));
	  case X86::SETB_C32r:
	    return Expand2AddrUndef(MIB, get(X86::SBB32rr));
	  case X86::SETB_C64r:
	    return Expand2AddrUndef(MIB, get(X86::SBB64rr));
	  case X86::V_SET0:
	  case X86::FsFLD0SS:
	  case X86::FsFLD0SD:
	    return Expand2AddrUndef(MIB, get(HasAVX ? X86::VXORPSrr : X86::XORPSrr));
	  case X86::AVX_SET0:
	    assert(HasAVX && "AVX not supported");
	    return Expand2AddrUndef(MIB, get(X86::VXORPSYrr));
	  case X86::AVX512_128_SET0:
	    return Expand2AddrUndef(MIB, get(X86::VPXORDZ128rr));
	  case X86::AVX512_256_SET0:
	    return Expand2AddrUndef(MIB, get(X86::VPXORDZ256rr));
	  case X86::AVX512_512_SET0:
	    return Expand2AddrUndef(MIB, get(X86::VPXORDZrr));
	  case X86::AVX512_FsFLD0SS:
	  case X86::AVX512_FsFLD0SD:
	    return Expand2AddrUndef(MIB, get(X86::VXORPSZ128rr));
	  case X86::V_SETALLONES:
	    return Expand2AddrUndef(MIB, get(HasAVX ? X86::VPCMPEQDrr : X86::PCMPEQDrr));
	  case X86::AVX2_SETALLONES:
	    return Expand2AddrUndef(MIB, get(X86::VPCMPEQDYrr));
	  case X86::AVX512_512_SETALLONES: {
	    unsigned Reg = MIB->getOperand(0).getReg();
	    MIB->setDesc(get(X86::VPTERNLOGDZrri));
	    // VPTERNLOGD needs 3 register inputs and an immediate.
	    // 0xff will return 1s for any input.
	    MIB.addReg(Reg, RegState::Undef).addReg(Reg, RegState::Undef)
	       .addReg(Reg, RegState::Undef).addImm(0xff);
	    return true;
	  }
	  case X86::AVX512_512_SEXT_MASK_32:
	  case X86::AVX512_512_SEXT_MASK_64: {
	    unsigned Reg = MIB->getOperand(0).getReg();
	    unsigned MaskReg = MIB->getOperand(1).getReg();
	    unsigned MaskState = getRegState(MIB->getOperand(1));
	    unsigned Opc = (MI.getOpcode() == X86::AVX512_512_SEXT_MASK_64) ?
	                   X86::VPTERNLOGQZrrikz : X86::VPTERNLOGDZrrikz;
	    // The mask operand is removed and re-added below in its new position,
	    // preserving its register state flags.
	    MI.RemoveOperand(1);
	    MIB->setDesc(get(Opc));
	    // VPTERNLOG needs 3 register inputs and an immediate.
	    // 0xff will return 1s for any input.
	    MIB.addReg(Reg, RegState::Undef).addReg(MaskReg, MaskState)
	       .addReg(Reg, RegState::Undef).addReg(Reg, RegState::Undef).addImm(0xff);
	    return true;
	  }
	  case X86::VMOVAPSZ128rm_NOVLX:
	    return expandNOVLXLoad(MIB, &getRegisterInfo(), get(X86::VMOVAPSrm),
	                           get(X86::VBROADCASTF32X4rm), X86::sub_xmm);
	  case X86::VMOVUPSZ128rm_NOVLX:
	    return expandNOVLXLoad(MIB, &getRegisterInfo(), get(X86::VMOVUPSrm),
	                           get(X86::VBROADCASTF32X4rm), X86::sub_xmm);
	  case X86::VMOVAPSZ256rm_NOVLX:
	    return expandNOVLXLoad(MIB, &getRegisterInfo(), get(X86::VMOVAPSYrm),
	                           get(X86::VBROADCASTF64X4rm), X86::sub_ymm);
	  case X86::VMOVUPSZ256rm_NOVLX:
	    return expandNOVLXLoad(MIB, &getRegisterInfo(), get(X86::VMOVUPSYrm),
	                           get(X86::VBROADCASTF64X4rm), X86::sub_ymm);
	  case X86::VMOVAPSZ128mr_NOVLX:
	    return expandNOVLXStore(MIB, &getRegisterInfo(), get(X86::VMOVAPSmr),
	                            get(X86::VEXTRACTF32x4Zmr), X86::sub_xmm);
	  case X86::VMOVUPSZ128mr_NOVLX:
	    return expandNOVLXStore(MIB, &getRegisterInfo(), get(X86::VMOVUPSmr),
	                            get(X86::VEXTRACTF32x4Zmr), X86::sub_xmm);
	  case X86::VMOVAPSZ256mr_NOVLX:
	    return expandNOVLXStore(MIB, &getRegisterInfo(), get(X86::VMOVAPSYmr),
	                            get(X86::VEXTRACTF64x4Zmr), X86::sub_ymm);
	  case X86::VMOVUPSZ256mr_NOVLX:
	    return expandNOVLXStore(MIB, &getRegisterInfo(), get(X86::VMOVUPSYmr),
	                            get(X86::VEXTRACTF64x4Zmr), X86::sub_ymm);
	  case X86::TEST8ri_NOREX:
	    MI.setDesc(get(X86::TEST8ri));
	    return true;
	  case X86::MOV32ri64:
	    MI.setDesc(get(X86::MOV32ri));
	    return true;

	  // KNL does not recognize dependency-breaking idioms for mask registers,
	  // so kxnor %k1, %k1, %k2 has a RAW dependence on %k1.
	  // Using %k0 as the undef input register is a performance heuristic based
	  // on the assumption that %k0 is used less frequently than the other mask
	  // registers, since it is not usable as a write mask.
	  // FIXME: A more advanced approach would be to choose the best input mask
	  // register based on context.
	  case X86::KSET0B:
	  case X86::KSET0W: return Expand2AddrKreg(MIB, get(X86::KXORWrr), X86::K0);
	  case X86::KSET0D: return Expand2AddrKreg(MIB, get(X86::KXORDrr), X86::K0);
	  case X86::KSET0Q: return Expand2AddrKreg(MIB, get(X86::KXORQrr), X86::K0);
	  case X86::KSET1B:
	  case X86::KSET1W: return Expand2AddrKreg(MIB, get(X86::KXNORWrr), X86::K0);
	  case X86::KSET1D: return Expand2AddrKreg(MIB, get(X86::KXNORDrr), X86::K0);
	  case X86::KSET1Q: return Expand2AddrKreg(MIB, get(X86::KXNORQrr), X86::K0);
	  case TargetOpcode::LOAD_STACK_GUARD:
	    expandLoadStackGuard(MIB, *this);
	    return true;
	  }
	  return false;
	}

	/// Append the memory-reference operands \p MOs to \p MIB, applying
	/// \p PtrOffset to the reference.
	///
	/// Lists shorter than four operands are copied as-is and followed by an
	/// offset added via addOffset(). Otherwise the list must hold exactly five
	/// operands, and a non-zero \p PtrOffset is folded into the operand at
	/// index 3.
	static void addOperands(MachineInstrBuilder &MIB, ArrayRef<MachineOperand> MOs,
	                        int PtrOffset = 0) {
	  const unsigned Count = MOs.size();

	  if (Count >= 4) {
	    // General memory addressing - any offset has to be added to the
	    // existing displacement operand.
	    assert(MOs.size() == 5 && "Unexpected memory operand list length");
	    for (unsigned Idx = 0; Idx != Count; ++Idx) {
	      const MachineOperand &Op = MOs[Idx];
	      if (Idx != 3 || PtrOffset == 0)
	        MIB.addOperand(Op);
	      else
	        MIB.addDisp(Op, PtrOffset);
	    }
	    return;
	  }

	  // FrameIndex only - add an immediate offset (whether it's zero or not).
	  for (const MachineOperand &Op : MOs)
	    MIB.addOperand(Op);
	  addOffset(MIB, PtrOffset);
	}

	/// Build the memory form (\p Opcode) of the two-address instruction \p MI:
	/// the memory reference \p MOs takes the place of MI's first two operands
	/// and every remaining operand of \p MI is copied over. The new instruction
	/// is inserted before \p InsertPt and returned.
	static MachineInstr *FuseTwoAddrInst(MachineFunction &MF, unsigned Opcode,
	                                     ArrayRef<MachineOperand> MOs,
	                                     MachineBasicBlock::iterator InsertPt,
	                                     MachineInstr &MI,
	                                     const TargetInstrInfo &TII) {
	  // Start from an instruction without implicit operands (something BuildMI
	  // can't do) and put the memory operand first.
	  MachineInstr *Fused =
	      MF.CreateMachineInstr(TII.get(Opcode), MI.getDebugLoc(), true);
	  MachineInstrBuilder Builder(MF, Fused);
	  addOperands(Builder, MOs);

	  // Copy everything past the two tied register operands: first the rest of
	  // the operands listed in the descriptor, then any trailing ones.
	  for (unsigned Idx = 2, End = MI.getNumOperands(); Idx != End; ++Idx)
	    Builder.addOperand(MI.getOperand(Idx));

	  InsertPt->getParent()->insert(InsertPt, Fused);

	  return Builder;
	}

	/// Build the memory form (\p Opcode) of \p MI in which operand \p OpNo is
	/// replaced by the memory reference \p MOs (adjusted by \p PtrOffset); all
	/// other operands are copied unchanged. The new instruction is inserted
	/// before \p InsertPt and returned.
	static MachineInstr *FuseInst(MachineFunction &MF, unsigned Opcode,
	                              unsigned OpNo, ArrayRef<MachineOperand> MOs,
	                              MachineBasicBlock::iterator InsertPt,
	                              MachineInstr &MI, const TargetInstrInfo &TII,
	                              int PtrOffset = 0) {
	  // Create the instruction without implicit operands, something BuildMI
	  // can't do.
	  MachineInstr *Fused =
	      MF.CreateMachineInstr(TII.get(Opcode), MI.getDebugLoc(), true);
	  MachineInstrBuilder Builder(MF, Fused);

	  for (unsigned Idx = 0, End = MI.getNumOperands(); Idx != End; ++Idx) {
	    MachineOperand &Op = MI.getOperand(Idx);
	    if (Idx != OpNo) {
	      Builder.addOperand(Op);
	      continue;
	    }
	    // This is the operand being folded: substitute the memory reference.
	    assert(Op.isReg() && "Expected to fold into reg operand!");
	    addOperands(Builder, MOs, PtrOffset);
	  }

	  InsertPt->getParent()->insert(InsertPt, Fused);

	  return Builder;
	}

	/// Build, before \p InsertPt, an instruction with opcode \p Opcode whose
	/// operands are the memory reference \p MOs followed by an immediate 0.
	/// The debug location is taken from \p MI.
	static MachineInstr *MakeM0Inst(const TargetInstrInfo &TII, unsigned Opcode,
	                                ArrayRef<MachineOperand> MOs,
	                                MachineBasicBlock::iterator InsertPt,
	                                MachineInstr &MI) {
	  MachineBasicBlock &ParentMBB = *InsertPt->getParent();
	  MachineInstrBuilder Builder =
	      BuildMI(ParentMBB, InsertPt, MI.getDebugLoc(), TII.get(Opcode));
	  addOperands(Builder, MOs);
	  Builder.addImm(0);
	  return Builder;
	}

	/// Handle memory folds that need more than a table-driven opcode swap.
	///
	/// Currently covers folding into operand 2 of the INSERTPS and MOVHLPS
	/// families, where the memory form reads a narrower value and therefore
	/// needs a pointer offset (and, for INSERTPS, an adjusted immediate).
	///
	/// \param OpNum  Index of the operand of \p MI to replace with memory.
	/// \param MOs    Operands describing the memory reference.
	/// \param Size   Size of the memory object; must not exceed the size of the
	///               operand's register class for the fold to happen.
	/// \param Align  Alignment of the memory object.
	/// \returns the new instruction inserted before \p InsertPt, or nullptr if
	/// no custom fold applies.
	MachineInstr *X86InstrInfo::foldMemoryOperandCustom(
	    MachineFunction &MF, MachineInstr &MI, unsigned OpNum,
	    ArrayRef<MachineOperand> MOs, MachineBasicBlock::iterator InsertPt,
	    unsigned Size, unsigned Align) const {
	  switch (MI.getOpcode()) {
	  case X86::INSERTPSrr:
	  case X86::VINSERTPSrr:
	  case X86::VINSERTPSZrr:
	    // Attempt to convert the load of inserted vector into a fold load
	    // of a single float.
	    if (OpNum == 2) {
	      // Decode the immediate: bits 0-3 zero mask, 4-5 destination index,
	      // 6-7 source index.
	      unsigned Imm = MI.getOperand(MI.getNumOperands() - 1).getImm();
	      unsigned ZMask = Imm & 15;
	      unsigned DstIdx = (Imm >> 4) & 3;
	      unsigned SrcIdx = (Imm >> 6) & 3;

	      unsigned RCSize = getRegClass(MI.getDesc(), OpNum, &RI, MF)->getSize();
	      if (Size <= RCSize && 4 <= Align) {
	        // The source index becomes a byte offset into memory and is dropped
	        // from the new immediate.
	        int PtrOffset = SrcIdx * 4;
	        unsigned NewImm = (DstIdx << 4) | ZMask;
	        unsigned NewOpCode =
	            (MI.getOpcode() == X86::VINSERTPSZrr) ? X86::VINSERTPSZrm :
	            (MI.getOpcode() == X86::VINSERTPSrr)  ? X86::VINSERTPSrm  :
	                                                    X86::INSERTPSrm;
	        MachineInstr *NewMI =
	            FuseInst(MF, NewOpCode, OpNum, MOs, InsertPt, MI, *this, PtrOffset);
	        NewMI->getOperand(NewMI->getNumOperands() - 1).setImm(NewImm);
	        return NewMI;
	      }
	    }
	    break;
	  case X86::MOVHLPSrr:
	  case X86::VMOVHLPSrr:
	  case X86::VMOVHLPSZrr:
	    // Move the upper 64-bits of the second operand to the lower 64-bits.
	    // To fold the load, adjust the pointer to the upper and use (V)MOVLPS.
	    // TODO: In most cases AVX doesn't have a 8-byte alignment requirement.
	    if (OpNum == 2) {
	      unsigned RCSize = getRegClass(MI.getDesc(), OpNum, &RI, MF)->getSize();
	      if (Size <= RCSize && 8 <= Align) {
	        unsigned NewOpCode =
	            (MI.getOpcode() == X86::VMOVHLPSZrr) ? X86::VMOVLPSZ128rm :
	            (MI.getOpcode() == X86::VMOVHLPSrr)  ? X86::VMOVLPSrm     :
	                                                   X86::MOVLPSrm;
	        MachineInstr *NewMI =
	            FuseInst(MF, NewOpCode, OpNum, MOs, InsertPt, MI, *this, 8);
	        return NewMI;
	      }
	    }
	    break;
	  }

	  return nullptr;
	}

	/// Attempt to fold the memory reference described by \p MOs into operand
	/// \p OpNum of \p MI.
	///
	/// Custom folds are tried first, then the register-to-memory opcode table
	/// selected by the operand index. If that fails and \p AllowCommute is set,
	/// \p MI is commuted in place and the fold is retried once on the commuted
	/// operand; the commute is undone if the retry fails too.
	///
	/// \param Size   Size of the memory object, or 0 to skip the size check.
	/// \param Align  Alignment of the memory object; must satisfy the minimum
	///               alignment recorded in the fold table entry.
	/// \returns the new instruction inserted before \p InsertPt, or nullptr if
	/// the operand could not be folded.
	MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
	    MachineFunction &MF, MachineInstr &MI, unsigned OpNum,
	    ArrayRef<MachineOperand> MOs, MachineBasicBlock::iterator InsertPt,
	    unsigned Size, unsigned Align, bool AllowCommute) const {
	  const DenseMap<unsigned,
	                 std::pair<uint16_t, uint16_t> > *OpcodeTablePtr = nullptr;
	  bool isCallRegIndirect = Subtarget.callRegIndirect();
	  bool isTwoAddrFold = false;

	  // For CPUs that favor the register form of a call or push,
	  // do not fold loads into calls or pushes, unless optimizing for size
	  // aggressively.
	  if (isCallRegIndirect && !MF.getFunction()->optForMinSize() &&
	      (MI.getOpcode() == X86::CALL32r || MI.getOpcode() == X86::CALL64r ||
	       MI.getOpcode() == X86::PUSH16r || MI.getOpcode() == X86::PUSH32r ||
	       MI.getOpcode() == X86::PUSH64r))
	    return nullptr;

	  unsigned NumOps = MI.getDesc().getNumOperands();
	  bool isTwoAddr =
	      NumOps > 1 && MI.getDesc().getOperandConstraint(1, MCOI::TIED_TO) != -1;

	  // FIXME: AsmPrinter doesn't know how to handle
	  // X86II::MO_GOT_ABSOLUTE_ADDRESS after folding.
	  if (MI.getOpcode() == X86::ADD32ri &&
	      MI.getOperand(2).getTargetFlags() == X86II::MO_GOT_ABSOLUTE_ADDRESS)
	    return nullptr;

	  MachineInstr *NewMI = nullptr;

	  // Attempt to fold any custom cases we have.
	  if (MachineInstr *CustomMI =
	          foldMemoryOperandCustom(MF, MI, OpNum, MOs, InsertPt, Size, Align))
	    return CustomMI;

	  // Folding a memory location into the two-address part of a two-address
	  // instruction is different than folding it other places. It requires
	  // replacing the two registers with the memory location.
	  if (isTwoAddr && NumOps >= 2 && OpNum < 2 && MI.getOperand(0).isReg() &&
	      MI.getOperand(1).isReg() &&
	      MI.getOperand(0).getReg() == MI.getOperand(1).getReg()) {
	    OpcodeTablePtr = &RegOp2MemOpTable2Addr;
	    isTwoAddrFold = true;
	  } else if (OpNum == 0) {
	    if (MI.getOpcode() == X86::MOV32r0) {
	      NewMI = MakeM0Inst(*this, X86::MOV32mi, MOs, InsertPt, MI);
	      if (NewMI)
	        return NewMI;
	    }

	    OpcodeTablePtr = &RegOp2MemOpTable0;
	  } else if (OpNum == 1) {
	    OpcodeTablePtr = &RegOp2MemOpTable1;
	  } else if (OpNum == 2) {
	    OpcodeTablePtr = &RegOp2MemOpTable2;
	  } else if (OpNum == 3) {
	    OpcodeTablePtr = &RegOp2MemOpTable3;
	  } else if (OpNum == 4) {
	    OpcodeTablePtr = &RegOp2MemOpTable4;
	  }

	  // If table selected...
	  if (OpcodeTablePtr) {
	    // Find the Opcode to fuse
	    auto I = OpcodeTablePtr->find(MI.getOpcode());
	    if (I != OpcodeTablePtr->end()) {
	      // Table entry: first is the memory-form opcode, second holds flags
	      // including the minimum alignment.
	      unsigned Opcode = I->second.first;
	      unsigned MinAlign = (I->second.second & TB_ALIGN_MASK) >> TB_ALIGN_SHIFT;
	      if (Align < MinAlign)
	        return nullptr;
	      bool NarrowToMOV32rm = false;
	      if (Size) {
	        unsigned RCSize = getRegClass(MI.getDesc(), OpNum, &RI, MF)->getSize();
	        if (Size < RCSize) {
	          // Check if it's safe to fold the load. If the size of the object is
	          // narrower than the load width, then it's not.
	          if (Opcode != X86::MOV64rm || RCSize != 8 || Size != 4)
	            return nullptr;
	          // If this is a 64-bit load, but the spill slot is 32, then we can do
	          // a 32-bit load which is implicitly zero-extended. This likely is
	          // due to live interval analysis remat'ing a load from stack slot.
	          if (MI.getOperand(0).getSubReg() || MI.getOperand(1).getSubReg())
	            return nullptr;
	          Opcode = X86::MOV32rm;
	          NarrowToMOV32rm = true;
	        }
	      }

	      if (isTwoAddrFold)
	        NewMI = FuseTwoAddrInst(MF, Opcode, MOs, InsertPt, MI, *this);
	      else
	        NewMI = FuseInst(MF, Opcode, OpNum, MOs, InsertPt, MI, *this);

	      if (NarrowToMOV32rm) {
	        // If this is the special case where we use a MOV32rm to load a 32-bit
	        // value and zero-extend the top bits. Change the destination register
	        // to a 32-bit one.
	        unsigned DstReg = NewMI->getOperand(0).getReg();
	        if (TargetRegisterInfo::isPhysicalRegister(DstReg))
	          NewMI->getOperand(0).setReg(RI.getSubReg(DstReg, X86::sub_32bit));
	        else
	          NewMI->getOperand(0).setSubReg(X86::sub_32bit);
	      }
	      return NewMI;
	    }
	  }

	  // If the instruction and target operand are commutable, commute the
	  // instruction and try again.
	  if (AllowCommute) {
	    unsigned CommuteOpIdx1 = OpNum, CommuteOpIdx2 = CommuteAnyOperandIndex;
	    if (findCommutedOpIndices(MI, CommuteOpIdx1, CommuteOpIdx2)) {
	      bool HasDef = MI.getDesc().getNumDefs();
	      unsigned Reg0 = HasDef ? MI.getOperand(0).getReg() : 0;
	      unsigned Reg1 = MI.getOperand(CommuteOpIdx1).getReg();
	      unsigned Reg2 = MI.getOperand(CommuteOpIdx2).getReg();
	      bool Tied1 =
	          0 == MI.getDesc().getOperandConstraint(CommuteOpIdx1, MCOI::TIED_TO);
	      bool Tied2 =
	          0 == MI.getDesc().getOperandConstraint(CommuteOpIdx2, MCOI::TIED_TO);

	      // If either of the commutable operands are tied to the destination
	      // then we can not commute + fold.
	      if ((HasDef && Reg0 == Reg1 && Tied1) ||
	          (HasDef && Reg0 == Reg2 && Tied2))
	        return nullptr;

	      MachineInstr *CommutedMI =
	          commuteInstruction(MI, false, CommuteOpIdx1, CommuteOpIdx2);
	      if (!CommutedMI) {
	        // Unable to commute.
	        return nullptr;
	      }
	      if (CommutedMI != &MI) {
	        // New instruction. We can't fold from this.
	        CommutedMI->eraseFromParent();
	        return nullptr;
	      }

	      // Attempt to fold with the commuted version of the instruction.
	      NewMI = foldMemoryOperandImpl(MF, MI, CommuteOpIdx2, MOs, InsertPt,
	                                    Size, Align, /*AllowCommute=*/false);
	      if (NewMI)
	        return NewMI;

	      // Folding failed again - undo the commute before returning.
	      MachineInstr *UncommutedMI =
	          commuteInstruction(MI, false, CommuteOpIdx1, CommuteOpIdx2);
	      if (!UncommutedMI) {
	        // Unable to commute.
	        return nullptr;
	      }
	      if (UncommutedMI != &MI) {
	        // New instruction. It doesn't need to be kept.
	        UncommutedMI->eraseFromParent();
	        return nullptr;
	      }

	      // Return here to prevent duplicate fuse failure report.
	      return nullptr;
	    }
	  }

	  // No fusion
	  if (PrintFailedFusing && !MI.isCopy())
	    dbgs() << "We failed to fuse operand " << OpNum << " in " << MI;
	  return nullptr;
	}

	/// Tell whether \p Opcode is one of the instructions that only update the
	/// first 32 or 64 bits of the destination register and leave the rest
	/// unmodified. Callers can use this to avoid folding loads when only part
	/// of the destination is written and the untouched part is not needed
	/// (e.g. cvtss2sd, sqrtss). Unfolding the load from these instructions
	/// breaks the partial register dependency and can improve performance:
	///
	///   movss (%rdi), %xmm0
	///   cvtss2sd %xmm0, %xmm0
	///
	/// rather than
	///   cvtss2sd (%rdi), %xmm0
	///
	/// FIXME: This should be turned into a TSFlags.
	///
	static bool hasPartialRegUpdate(unsigned Opcode) {
	  switch (Opcode) {
	  default:
	    return false;
	  case X86::CVTSI2SSrr:
	  case X86::CVTSI2SSrm:
	  case X86::CVTSI2SS64rr:
	  case X86::CVTSI2SS64rm:
	  case X86::CVTSI2SDrr:
	  case X86::CVTSI2SDrm:
	  case X86::CVTSI2SD64rr:
	  case X86::CVTSI2SD64rm:
	  case X86::CVTSD2SSrr:
	  case X86::CVTSD2SSrm:
	  case X86::CVTSS2SDrr:
	  case X86::CVTSS2SDrm:
	  case X86::MOVHPDrm:
	  case X86::MOVHPSrm:
	  case X86::MOVLPDrm:
	  case X86::MOVLPSrm:
	  case X86::RCPSSr:
	  case X86::RCPSSm:
	  case X86::RCPSSr_Int:
	  case X86::RCPSSm_Int:
	  case X86::ROUNDSDr:
	  case X86::ROUNDSDm:
	  case X86::ROUNDSSr:
	  case X86::ROUNDSSm:
	  case X86::RSQRTSSr:
	  case X86::RSQRTSSm:
	  case X86::RSQRTSSr_Int:
	  case X86::RSQRTSSm_Int:
	  case X86::SQRTSSr:
	  case X86::SQRTSSm:
	  case X86::SQRTSSr_Int:
	  case X86::SQRTSSm_Int:
	  case X86::SQRTSDr:
	  case X86::SQRTSDm:
	  case X86::SQRTSDr_Int:
	  case X86::SQRTSDm_Int:
	    return true;
	  }
	}

	/// Inform the ExeDepsFix pass how many idle
	/// instructions we would like before a partial register update.
	///
	/// \returns 0 when \p OpNum is not operand 0, when the opcode has no
	/// partial register update, or when \p MI itself reads the destination
	/// register; otherwise PartialRegUpdateClearance.
	unsigned X86InstrInfo::getPartialRegUpdateClearance(
	    const MachineInstr &MI, unsigned OpNum,
	    const TargetRegisterInfo *TRI) const {
	  if (OpNum != 0 || !hasPartialRegUpdate(MI.getOpcode()))
	    return 0;

	  // If MI is marked as reading Reg, the partial register update is wanted.
	  const MachineOperand &MO = MI.getOperand(0);
	  unsigned Reg = MO.getReg();
	  if (TargetRegisterInfo::isVirtualRegister(Reg)) {
	    if (MO.readsReg() || MI.readsVirtualRegister(Reg))
	      return 0;
	  } else {
	    if (MI.readsRegister(Reg, TRI))
	      return 0;
	  }

	  // If any instructions in the clearance range are reading Reg, insert a
	  // dependency breaking instruction, which is inexpensive and is likely to
	  // be hidden in other instruction's cycles.
	  return PartialRegUpdateClearance;
	}

	// Tell whether Opcode is an instruction that copies the high bits of its
	// first source operand into the unused high bits of the destination operand.
	static bool hasUndefRegUpdate(unsigned Opcode) {
	  switch (Opcode) {
	  default:
	    return false;
	  case X86::VCVTSI2SSrr:
	  case X86::VCVTSI2SSrm:
	  case X86::Int_VCVTSI2SSrr:
	  case X86::Int_VCVTSI2SSrm:
	  case X86::VCVTSI2SS64rr:
	  case X86::VCVTSI2SS64rm:
	  case X86::Int_VCVTSI2SS64rr:
	  case X86::Int_VCVTSI2SS64rm:
	  case X86::VCVTSI2SDrr:
	  case X86::VCVTSI2SDrm:
	  case X86::Int_VCVTSI2SDrr:
	  case X86::Int_VCVTSI2SDrm:
	  case X86::VCVTSI2SD64rr:
	  case X86::VCVTSI2SD64rm:
	  case X86::Int_VCVTSI2SD64rr:
	  case X86::Int_VCVTSI2SD64rm:
	  case X86::VCVTSD2SSrr:
	  case X86::VCVTSD2SSrm:
	  case X86::Int_VCVTSD2SSrr:
	  case X86::Int_VCVTSD2SSrm:
	  case X86::VCVTSS2SDrr:
	  case X86::VCVTSS2SDrm:
	  case X86::Int_VCVTSS2SDrr:
	  case X86::Int_VCVTSS2SDrm:
	  case X86::VRCPSSr:
	  case X86::VRCPSSr_Int:
	  case X86::VRCPSSm:
	  case X86::VRCPSSm_Int:
	  case X86::VROUNDSDr:
	  case X86::VROUNDSDm:
	  case X86::VROUNDSDr_Int:
	  case X86::VROUNDSDm_Int:
	  case X86::VROUNDSSr:
	  case X86::VROUNDSSm:
	  case X86::VROUNDSSr_Int:
	  case X86::VROUNDSSm_Int:
	  case X86::VRSQRTSSr:
	  case X86::VRSQRTSSr_Int:
	  case X86::VRSQRTSSm:
	  case X86::VRSQRTSSm_Int:
	  case X86::VSQRTSSr:
	  case X86::VSQRTSSr_Int:
	  case X86::VSQRTSSm:
	  case X86::VSQRTSSm_Int:
	  case X86::VSQRTSDr:
	  case X86::VSQRTSDr_Int:
	  case X86::VSQRTSDm:
	  case X86::VSQRTSDm_Int:
	  // AVX-512
	  case X86::VCVTSI2SSZrr:
	  case X86::VCVTSI2SSZrm:
	  case X86::VCVTSI2SSZrr_Int:
	  case X86::VCVTSI2SSZrrb_Int:
	  case X86::VCVTSI2SSZrm_Int:
	  case X86::VCVTSI642SSZrr:
	  case X86::VCVTSI642SSZrm:
	  case X86::VCVTSI642SSZrr_Int:
	  case X86::VCVTSI642SSZrrb_Int:
	  case X86::VCVTSI642SSZrm_Int:
	  case X86::VCVTSI2SDZrr:
	  case X86::VCVTSI2SDZrm:
	  case X86::VCVTSI2SDZrr_Int:
	  case X86::VCVTSI2SDZrrb_Int:
	  case X86::VCVTSI2SDZrm_Int:
	  case X86::VCVTSI642SDZrr:
	  case X86::VCVTSI642SDZrm:
	  case X86::VCVTSI642SDZrr_Int:
	  case X86::VCVTSI642SDZrrb_Int:
	  case X86::VCVTSI642SDZrm_Int:
	  case X86::VCVTUSI2SSZrr:
	  case X86::VCVTUSI2SSZrm:
	  case X86::VCVTUSI2SSZrr_Int:
	  case X86::VCVTUSI2SSZrrb_Int:
	  case X86::VCVTUSI2SSZrm_Int:
	  case X86::VCVTUSI642SSZrr:
	  case X86::VCVTUSI642SSZrm:
	  case X86::VCVTUSI642SSZrr_Int:
	  case X86::VCVTUSI642SSZrrb_Int:
	  case X86::VCVTUSI642SSZrm_Int:
	  case X86::VCVTUSI2SDZrr:
	  case X86::VCVTUSI2SDZrm:
	  case X86::VCVTUSI2SDZrr_Int:
	  case X86::VCVTUSI2SDZrm_Int:
	  case X86::VCVTUSI642SDZrr:
	  case X86::VCVTUSI642SDZrm:
	  case X86::VCVTUSI642SDZrr_Int:
	  case X86::VCVTUSI642SDZrrb_Int:
	  case X86::VCVTUSI642SDZrm_Int:
	  case X86::VCVTSD2SSZrr:
	  case X86::VCVTSD2SSZrrb:
	  case X86::VCVTSD2SSZrm:
	  case X86::VCVTSS2SDZrr:
	  case X86::VCVTSS2SDZrrb:
	  case X86::VCVTSS2SDZrm:
	  case X86::VRNDSCALESDr:
	  case X86::VRNDSCALESDrb:
	  case X86::VRNDSCALESDm:
	  case X86::VRNDSCALESSr:
	  case X86::VRNDSCALESSrb:
	  case X86::VRNDSCALESSm:
	  case X86::VRCP14SSrr:
	  case X86::VRCP14SSrm:
	  case X86::VRSQRT14SSrr:
	  case X86::VRSQRT14SSrm:
	  case X86::VSQRTSSZr:
	  case X86::VSQRTSSZr_Int:
	  case X86::VSQRTSSZrb_Int:
	  case X86::VSQRTSSZm:
	  case X86::VSQRTSSZm_Int:
	  case X86::VSQRTSDZr:
	  case X86::VSQRTSDZr_Int:
	  case X86::VSQRTSDZrb_Int:
	  case X86::VSQRTSDZm:
	  case X86::VSQRTSDZm_Int:
	    return true;
	  }
	}

	/// Inform the ExeDepsFix pass how many idle instructions we would like before
	/// certain undef register reads.
	///
	/// This catches the VCVTSI2SD family of instructions:
	///
	/// vcvtsi2sdq %rax, %xmm0<undef>, %xmm14
	///
	/// We should to be careful not to catch VXOR idioms which are presumably
	/// handled specially in the pipeline:
	///
	/// vxorps %xmm1<undef>, %xmm1<undef>, %xmm1
	///
	/// Like getPartialRegUpdateClearance, this makes a strong assumption that the
	/// high bits that are passed-through are not live.
	unsigned
	X86InstrInfo::getUndefRegClearance(const MachineInstr &MI, unsigned &OpNum,
	const TargetRegisterInfo *TRI) const {
	if (!hasUndefRegUpdate(MI.getOpcode()))
	return 0;

	// Set the OpNum parameter to the first source operand.
	OpNum = 1;

	const MachineOperand &MO = MI.getOperand(OpNum);
	if (MO.isUndef() && TargetRegisterInfo::isPhysicalRegister(MO.getReg())) {
	return UndefRegClearance;
	}
	return 0;
	}

	/// Insert a register-clearing xorps/vxorps before \p MI for the register in
	/// operand \p OpNum, and mark that register as killed on \p MI. Nothing is
	/// done if \p MI already kills the register or if the register is in
	/// neither VR128 nor VR256.
	void X86InstrInfo::breakPartialRegDependency(
	    MachineInstr &MI, unsigned OpNum, const TargetRegisterInfo *TRI) const {
	  const unsigned Reg = MI.getOperand(OpNum).getReg();
	  // If MI kills this register, the false dependence is already broken.
	  if (MI.killsRegister(Reg, TRI))
	    return;

	  if (X86::VR128RegClass.contains(Reg)) {
	    // These instructions are all floating point domain, so xorps is the best
	    // choice.
	    const unsigned XorOpc =
	        Subtarget.hasAVX() ? X86::VXORPSrr : X86::XORPSrr;
	    BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), get(XorOpc), Reg)
	        .addReg(Reg, RegState::Undef)
	        .addReg(Reg, RegState::Undef);
	    MI.addRegisterKilled(Reg, TRI, true);
	    return;
	  }

	  if (X86::VR256RegClass.contains(Reg)) {
	    // Use vxorps to clear the full ymm register. The instruction reads and
	    // writes the xmm sub-register, with the ymm register added as an
	    // implicit def.
	    const unsigned XmmSub = TRI->getSubReg(Reg, X86::sub_xmm);
	    BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), get(X86::VXORPSrr), XmmSub)
	        .addReg(XmmSub, RegState::Undef)
	        .addReg(XmmSub, RegState::Undef)
	        .addReg(Reg, RegState::ImplicitDefine);
	    MI.addRegisterKilled(Reg, TRI, true);
	  }
	}

	/// Try to fold an access to the stack slot \p FrameIndex into \p MI.
	///
	/// \p Ops lists the operand indices of \p MI that should be replaced by the
	/// memory reference. A single operand is supported, plus the special case
	/// {0, 1} on TEST8rr/TEST16rr/TEST32rr/TEST64rr, where \p MI is first
	/// rewritten in place to the matching "CMP r, 0" form so that only operand 0
	/// remains to be folded.
	///
	/// \p LIS is not used by this overload.
	///
	/// \returns nullptr when any of the checks below rejects the fold; otherwise
	/// the result of the operand-based foldMemoryOperandImpl overload.
	MachineInstr *
	X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, MachineInstr &MI,
	                                    ArrayRef<unsigned> Ops,
	                                    MachineBasicBlock::iterator InsertPt,
	                                    int FrameIndex, LiveIntervals *LIS) const {
	  // Check switch flag
	  if (NoFusing)
	    return nullptr;

	  // Unless optimizing for size, don't fold to avoid partial
	  // register update stalls
	  if (!MF.getFunction()->optForSize() && hasPartialRegUpdate(MI.getOpcode()))
	    return nullptr;

	  // Don't fold subreg spills, or reloads that use a high subreg.
	  for (auto Op : Ops) {
	    MachineOperand &MO = MI.getOperand(Op);
	    auto SubReg = MO.getSubReg();
	    if (SubReg && (MO.isDef() || SubReg == X86::sub_8bit_hi))
	      return nullptr;
	  }

	  const MachineFrameInfo &MFI = MF.getFrameInfo();
	  unsigned Size = MFI.getObjectSize(FrameIndex);
	  unsigned Alignment = MFI.getObjectAlignment(FrameIndex);
	  // If the function stack isn't realigned we don't want to fold instructions
	  // that need increased alignment.
	  if (!RI.needsStackRealignment(MF))
	    Alignment =
	        std::min(Alignment, Subtarget.getFrameLowering()->getStackAlignment());
	  if (Ops.size() == 2 && Ops[0] == 0 && Ops[1] == 1) {
	    unsigned NewOpc = 0;
	    unsigned RCSize = 0;
	    switch (MI.getOpcode()) {
	    default: return nullptr;
	    case X86::TEST8rr:  NewOpc = X86::CMP8ri;   RCSize = 1; break;
	    case X86::TEST16rr: NewOpc = X86::CMP16ri8; RCSize = 2; break;
	    case X86::TEST32rr: NewOpc = X86::CMP32ri8; RCSize = 4; break;
	    case X86::TEST64rr: NewOpc = X86::CMP64ri8; RCSize = 8; break;
	    }
	    // Check if it's safe to fold the load. If the size of the object is
	    // narrower than the load width, then it's not.
	    if (Size < RCSize)
	      return nullptr;
	    // Change to CMPXXri r, 0 first.
	    MI.setDesc(get(NewOpc));
	    MI.getOperand(1).ChangeToImmediate(0);
	  } else if (Ops.size() != 1)
	    return nullptr;

	  return foldMemoryOperandImpl(MF, MI, Ops[0],
	                               MachineOperand::CreateFI(FrameIndex), InsertPt,
	                               Size, Alignment, /*AllowCommute=*/true);
	}

	/// Check if \p LoadMI is a partial register load that we can't fold into
	/// \p UserMI because the latter uses contents that wouldn't be defined in the
	/// folded version. For instance, this transformation isn't legal:
	///   movss (%rdi), %xmm0
	///   addps %xmm0, %xmm0
	/// ->
	///   addps (%rdi), %xmm0
	///
	/// But this one is:
	///   movss (%rdi), %xmm0
	///   addss %xmm0, %xmm0
	/// ->
	///   addss (%rdi), %xmm0
	///
	/// \returns true when the fold must be rejected. Returns false when \p LoadMI
	/// is not one of the scalar SS/SD loads checked here, when the register class
	/// of its destination is no wider than the loaded value, or when \p UserMI is
	/// one of the scalar opcodes listed below.
	static bool isNonFoldablePartialRegisterLoad(const MachineInstr &LoadMI,
	                                             const MachineInstr &UserMI,
	                                             const MachineFunction &MF) {
	  unsigned Opc = LoadMI.getOpcode();
	  unsigned UserOpc = UserMI.getOpcode();
	  // Size (in bytes, per the comparisons below) of the register class of the
	  // load's destination register.
	  unsigned RegSize =
	      MF.getRegInfo().getRegClass(LoadMI.getOperand(0).getReg())->getSize();

	  if ((Opc == X86::MOVSSrm || Opc == X86::VMOVSSrm || Opc == X86::VMOVSSZrm) &&
	      RegSize > 4) {
	    // These instructions only load 32 bits, we can't fold them if the
	    // destination register is wider than 32 bits (4 bytes), and its user
	    // instruction isn't scalar (SS).
	    switch (UserOpc) {
	    case X86::ADDSSrr_Int: case X86::VADDSSrr_Int: case X86::VADDSSZrr_Int:
	    case X86::Int_CMPSSrr: case X86::Int_VCMPSSrr: case X86::VCMPSSZrr_Int:
	    case X86::DIVSSrr_Int: case X86::VDIVSSrr_Int: case X86::VDIVSSZrr_Int:
	    case X86::MAXSSrr_Int: case X86::VMAXSSrr_Int: case X86::VMAXSSZrr_Int:
	    case X86::MINSSrr_Int: case X86::VMINSSrr_Int: case X86::VMINSSZrr_Int:
	    case X86::MULSSrr_Int: case X86::VMULSSrr_Int: case X86::VMULSSZrr_Int:
	    case X86::SUBSSrr_Int: case X86::VSUBSSrr_Int: case X86::VSUBSSZrr_Int:
	    case X86::VFMADDSS4rr_Int:   case X86::VFNMADDSS4rr_Int:
	    case X86::VFMSUBSS4rr_Int:   case X86::VFNMSUBSS4rr_Int:
	    case X86::VFMADD132SSr_Int:  case X86::VFNMADD132SSr_Int:
	    case X86::VFMADD213SSr_Int:  case X86::VFNMADD213SSr_Int:
	    case X86::VFMADD231SSr_Int:  case X86::VFNMADD231SSr_Int:
	    case X86::VFMSUB132SSr_Int:  case X86::VFNMSUB132SSr_Int:
	    case X86::VFMSUB213SSr_Int:  case X86::VFNMSUB213SSr_Int:
	    case X86::VFMSUB231SSr_Int:  case X86::VFNMSUB231SSr_Int:
	    case X86::VFMADD132SSZr_Int: case X86::VFNMADD132SSZr_Int:
	    case X86::VFMADD213SSZr_Int: case X86::VFNMADD213SSZr_Int:
	    case X86::VFMADD231SSZr_Int: case X86::VFNMADD231SSZr_Int:
	    case X86::VFMSUB132SSZr_Int: case X86::VFNMSUB132SSZr_Int:
	    case X86::VFMSUB213SSZr_Int: case X86::VFNMSUB213SSZr_Int:
	    case X86::VFMSUB231SSZr_Int: case X86::VFNMSUB231SSZr_Int:
	      return false;
	    default:
	      return true;
	    }
	  }

	  if ((Opc == X86::MOVSDrm || Opc == X86::VMOVSDrm || Opc == X86::VMOVSDZrm) &&
	      RegSize > 8) {
	    // These instructions only load 64 bits, we can't fold them if the
	    // destination register is wider than 64 bits (8 bytes), and its user
	    // instruction isn't scalar (SD).
	    switch (UserOpc) {
	    case X86::ADDSDrr_Int: case X86::VADDSDrr_Int: case X86::VADDSDZrr_Int:
	    case X86::Int_CMPSDrr: case X86::Int_VCMPSDrr: case X86::VCMPSDZrr_Int:
	    case X86::DIVSDrr_Int: case X86::VDIVSDrr_Int: case X86::VDIVSDZrr_Int:
	    case X86::MAXSDrr_Int: case X86::VMAXSDrr_Int: case X86::VMAXSDZrr_Int:
	    case X86::MINSDrr_Int: case X86::VMINSDrr_Int: case X86::VMINSDZrr_Int:
	    case X86::MULSDrr_Int: case X86::VMULSDrr_Int: case X86::VMULSDZrr_Int:
	    case X86::SUBSDrr_Int: case X86::VSUBSDrr_Int: case X86::VSUBSDZrr_Int:
	    case X86::VFMADDSD4rr_Int:   case X86::VFNMADDSD4rr_Int:
	    case X86::VFMSUBSD4rr_Int:   case X86::VFNMSUBSD4rr_Int:
	    case X86::VFMADD132SDr_Int:  case X86::VFNMADD132SDr_Int:
	    case X86::VFMADD213SDr_Int:  case X86::VFNMADD213SDr_Int:
	    case X86::VFMADD231SDr_Int:  case X86::VFNMADD231SDr_Int:
	    case X86::VFMSUB132SDr_Int:  case X86::VFNMSUB132SDr_Int:
	    case X86::VFMSUB213SDr_Int:  case X86::VFNMSUB213SDr_Int:
	    case X86::VFMSUB231SDr_Int:  case X86::VFNMSUB231SDr_Int:
	    case X86::VFMADD132SDZr_Int: case X86::VFNMADD132SDZr_Int:
	    case X86::VFMADD213SDZr_Int: case X86::VFNMADD213SDZr_Int:
	    case X86::VFMADD231SDZr_Int: case X86::VFNMADD231SDZr_Int:
	    case X86::VFMSUB132SDZr_Int: case X86::VFNMSUB132SDZr_Int:
	    case X86::VFMSUB213SDZr_Int: case X86::VFNMSUB213SDZr_Int:
	    case X86::VFMSUB231SDZr_Int: case X86::VFNMSUB231SDZr_Int:
	      return false;
	    default:
	      return true;
	    }
	  }

	  return false;
	}

	/// Try to fold the load \p LoadMI into \p MI.
	///
	/// \p Ops lists the operand indices of \p MI to replace with the memory
	/// reference. A single operand is supported, plus the {0, 1} case on
	/// TEST8rr/TEST16rr/TEST32rr/TEST64rr, where \p MI is first rewritten in
	/// place to the matching "CMP r, 0" form.
	///
	/// Three shapes of \p LoadMI are handled:
	///  - a load from a stack slot, which is forwarded to the FrameIndex
	///    overload;
	///  - one of the SET0 / SETALLONES / FsFLD0 pseudos listed below, which is
	///    materialized as a constant-pool entry and folded as a load from it;
	///  - any other load, whose trailing X86::AddrNumOperands operands are copied
	///    as the address.
	///
	/// \returns nullptr when the fold is rejected; otherwise the result of the
	/// overload this call is forwarded to.
	MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
	    MachineFunction &MF, MachineInstr &MI, ArrayRef<unsigned> Ops,
	    MachineBasicBlock::iterator InsertPt, MachineInstr &LoadMI,
	    LiveIntervals *LIS) const {

	  // TODO: Support the case where LoadMI loads a wide register, but MI
	  // only uses a subreg.
	  for (auto Op : Ops) {
	    if (MI.getOperand(Op).getSubReg())
	      return nullptr;
	  }

	  // If loading from a FrameIndex, fold directly from the FrameIndex.
	  unsigned NumOps = LoadMI.getDesc().getNumOperands();
	  int FrameIndex;
	  if (isLoadFromStackSlot(LoadMI, FrameIndex)) {
	    if (isNonFoldablePartialRegisterLoad(LoadMI, MI, MF))
	      return nullptr;
	    return foldMemoryOperandImpl(MF, MI, Ops, InsertPt, FrameIndex, LIS);
	  }

	  // Check switch flag
	  if (NoFusing) return nullptr;

	  // Avoid partial register update stalls unless optimizing for size.
	  if (!MF.getFunction()->optForSize() && hasPartialRegUpdate(MI.getOpcode()))
	    return nullptr;

	  // Determine the alignment of the load. With exactly one memory operand it
	  // is taken from that operand; otherwise only the pseudos below are
	  // accepted, each with a fixed alignment.
	  unsigned Alignment = 0;
	  if (LoadMI.hasOneMemOperand())
	    Alignment = (*LoadMI.memoperands_begin())->getAlignment();
	  else
	    switch (LoadMI.getOpcode()) {
	    case X86::AVX512_512_SET0:
	    case X86::AVX512_512_SETALLONES:
	      Alignment = 64;
	      break;
	    case X86::AVX2_SETALLONES:
	    case X86::AVX_SET0:
	    case X86::AVX512_256_SET0:
	      Alignment = 32;
	      break;
	    case X86::V_SET0:
	    case X86::V_SETALLONES:
	    case X86::AVX512_128_SET0:
	      Alignment = 16;
	      break;
	    case X86::FsFLD0SD:
	    case X86::AVX512_FsFLD0SD:
	      Alignment = 8;
	      break;
	    case X86::FsFLD0SS:
	    case X86::AVX512_FsFLD0SS:
	      Alignment = 4;
	      break;
	    default:
	      return nullptr;
	    }
	  if (Ops.size() == 2 && Ops[0] == 0 && Ops[1] == 1) {
	    unsigned NewOpc = 0;
	    switch (MI.getOpcode()) {
	    default: return nullptr;
	    case X86::TEST8rr:  NewOpc = X86::CMP8ri; break;
	    case X86::TEST16rr: NewOpc = X86::CMP16ri8; break;
	    case X86::TEST32rr: NewOpc = X86::CMP32ri8; break;
	    case X86::TEST64rr: NewOpc = X86::CMP64ri8; break;
	    }
	    // Change to CMPXXri r, 0 first.
	    MI.setDesc(get(NewOpc));
	    MI.getOperand(1).ChangeToImmediate(0);
	  } else if (Ops.size() != 1)
	    return nullptr;

	  // Make sure the subregisters match.
	  // Otherwise we risk changing the size of the load.
	  if (LoadMI.getOperand(0).getSubReg() != MI.getOperand(Ops[0]).getSubReg())
	    return nullptr;

	  SmallVector<MachineOperand,X86::AddrNumOperands> MOs;
	  switch (LoadMI.getOpcode()) {
	  case X86::V_SET0:
	  case X86::V_SETALLONES:
	  case X86::AVX2_SETALLONES:
	  case X86::AVX_SET0:
	  case X86::AVX512_128_SET0:
	  case X86::AVX512_256_SET0:
	  case X86::AVX512_512_SET0:
	  case X86::AVX512_512_SETALLONES:
	  case X86::FsFLD0SD:
	  case X86::AVX512_FsFLD0SD:
	  case X86::FsFLD0SS:
	  case X86::AVX512_FsFLD0SS: {
	    // Folding a V_SET0 or V_SETALLONES as a load, to ease register pressure.
	    // Create a constant-pool entry and operands to load from it.

	    // Medium and large mode can't fold loads this way.
	    if (MF.getTarget().getCodeModel() != CodeModel::Small &&
	        MF.getTarget().getCodeModel() != CodeModel::Kernel)
	      return nullptr;

	    // x86-32 PIC requires a PIC base register for constant pools.
	    unsigned PICBase = 0;
	    if (MF.getTarget().isPositionIndependent()) {
	      if (Subtarget.is64Bit())
	        PICBase = X86::RIP;
	      else
	        // FIXME: PICBase = getGlobalBaseReg(&MF);
	        // This doesn't work for several reasons.
	        // 1. GlobalBaseReg may have been spilled.
	        // 2. It may not be live at MI.
	        return nullptr;
	    }

	    // Create a constant-pool entry. The constant's type is chosen from the
	    // pseudo: float, double, or a vector of 16, 8 or 4 i32 elements.
	    MachineConstantPool &MCP = *MF.getConstantPool();
	    Type *Ty;
	    unsigned Opc = LoadMI.getOpcode();
	    if (Opc == X86::FsFLD0SS || Opc == X86::AVX512_FsFLD0SS)
	      Ty = Type::getFloatTy(MF.getFunction()->getContext());
	    else if (Opc == X86::FsFLD0SD || Opc == X86::AVX512_FsFLD0SD)
	      Ty = Type::getDoubleTy(MF.getFunction()->getContext());
	    else if (Opc == X86::AVX512_512_SET0 || Opc == X86::AVX512_512_SETALLONES)
	      Ty = VectorType::get(Type::getInt32Ty(MF.getFunction()->getContext()),16);
	    else if (Opc == X86::AVX2_SETALLONES || Opc == X86::AVX_SET0 ||
	             Opc == X86::AVX512_256_SET0)
	      Ty = VectorType::get(Type::getInt32Ty(MF.getFunction()->getContext()), 8);
	    else
	      Ty = VectorType::get(Type::getInt32Ty(MF.getFunction()->getContext()), 4);

	    bool IsAllOnes = (Opc == X86::V_SETALLONES || Opc == X86::AVX2_SETALLONES ||
	                      Opc == X86::AVX512_512_SETALLONES);
	    const Constant *C = IsAllOnes ? Constant::getAllOnesValue(Ty) :
	                                    Constant::getNullValue(Ty);
	    unsigned CPI = MCP.getConstantPoolIndex(C, Alignment);

	    // Create operands to load from the constant pool entry.
	    MOs.push_back(MachineOperand::CreateReg(PICBase, false));
	    MOs.push_back(MachineOperand::CreateImm(1));
	    MOs.push_back(MachineOperand::CreateReg(0, false));
	    MOs.push_back(MachineOperand::CreateCPI(CPI, 0));
	    MOs.push_back(MachineOperand::CreateReg(0, false));
	    break;
	  }
	  default: {
	    if (isNonFoldablePartialRegisterLoad(LoadMI, MI, MF))
	      return nullptr;

	    // Folding a normal load. Just copy the load's address operands.
	    MOs.append(LoadMI.operands_begin() + NumOps - X86::AddrNumOperands,
	               LoadMI.operands_begin() + NumOps);
	    break;
	  }
	  }
	  return foldMemoryOperandImpl(MF, MI, Ops[0], MOs, InsertPt,
	                               /*Size=*/0, Alignment, /*AllowCommute=*/true);
	}

	/// Split the memory access folded into \p MI back out into separate
	/// instructions, appended to \p NewMIs in this order: an optional load into
	/// \p Reg, the register form of the data-processing instruction, and an
	/// optional store of \p Reg.
	///
	/// The register-form opcode, the index of the folded operand and the
	/// load/store flags come from MemOp2RegOpTable.
	///
	/// \returns false (leaving \p NewMIs untouched) when the opcode has no table
	/// entry, when a requested load/store unfold is not available for it, or
	/// when unfolding would create a 16-byte access without memory operands on a
	/// subtarget where unaligned 16-byte accesses are slow; true otherwise.
	bool X86InstrInfo::unfoldMemoryOperand(
	    MachineFunction &MF, MachineInstr &MI, unsigned Reg, bool UnfoldLoad,
	    bool UnfoldStore, SmallVectorImpl<MachineInstr *> &NewMIs) const {
	  auto I = MemOp2RegOpTable.find(MI.getOpcode());
	  if (I == MemOp2RegOpTable.end())
	    return false;
	  unsigned Opc = I->second.first;
	  unsigned Index = I->second.second & TB_INDEX_MASK;
	  bool FoldedLoad = I->second.second & TB_FOLDED_LOAD;
	  bool FoldedStore = I->second.second & TB_FOLDED_STORE;
	  if (UnfoldLoad && !FoldedLoad)
	    return false;
	  UnfoldLoad &= FoldedLoad;
	  if (UnfoldStore && !FoldedStore)
	    return false;
	  UnfoldStore &= FoldedStore;

	  const MCInstrDesc &MCID = get(Opc);
	  const TargetRegisterClass *RC = getRegClass(MCID, Index, &RI, MF);
	  // TODO: Check if 32-byte or greater accesses are slow too?
	  if (!MI.hasOneMemOperand() && RC == &X86::VR128RegClass &&
	      Subtarget.isUnalignedMem16Slow())
	    // Without memoperands, loadRegFromAddr and storeRegToStackSlot will
	    // conservatively assume the address is unaligned. That's bad for
	    // performance.
	    return false;

	  // Partition MI's operands: the X86::AddrNumOperands address operands
	  // starting at Index, implicit register operands, and the explicit
	  // operands before / after the address.
	  SmallVector<MachineOperand, X86::AddrNumOperands> AddrOps;
	  SmallVector<MachineOperand,2> BeforeOps;
	  SmallVector<MachineOperand,2> AfterOps;
	  SmallVector<MachineOperand,4> ImpOps;
	  for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
	    MachineOperand &Op = MI.getOperand(i);
	    if (i >= Index && i < Index + X86::AddrNumOperands)
	      AddrOps.push_back(Op);
	    else if (Op.isReg() && Op.isImplicit())
	      ImpOps.push_back(Op);
	    else if (i < Index)
	      BeforeOps.push_back(Op);
	    else if (i > Index)
	      AfterOps.push_back(Op);
	  }

	  // Emit the load instruction.
	  if (UnfoldLoad) {
	    std::pair<MachineInstr::mmo_iterator, MachineInstr::mmo_iterator> MMOs =
	        MF.extractLoadMemRefs(MI.memoperands_begin(), MI.memoperands_end());
	    loadRegFromAddr(MF, Reg, AddrOps, RC, MMOs.first, MMOs.second, NewMIs);
	    if (UnfoldStore) {
	      // Address operands cannot be marked isKill.
	      for (unsigned i = 1; i != 1 + X86::AddrNumOperands; ++i) {
	        MachineOperand &MO = NewMIs[0]->getOperand(i);
	        if (MO.isReg())
	          MO.setIsKill(false);
	      }
	    }
	  }

	  // Emit the data processing instruction.
	  MachineInstr *DataMI = MF.CreateMachineInstr(MCID, MI.getDebugLoc(), true);
	  MachineInstrBuilder MIB(MF, DataMI);

	  if (FoldedStore)
	    MIB.addReg(Reg, RegState::Define);
	  for (MachineOperand &BeforeOp : BeforeOps)
	    MIB.addOperand(BeforeOp);
	  if (FoldedLoad)
	    MIB.addReg(Reg);
	  for (MachineOperand &AfterOp : AfterOps)
	    MIB.addOperand(AfterOp);
	  // Re-add the implicit register operands with their original flags.
	  for (MachineOperand &ImpOp : ImpOps) {
	    MIB.addReg(ImpOp.getReg(),
	               getDefRegState(ImpOp.isDef()) |
	               RegState::Implicit |
	               getKillRegState(ImpOp.isKill()) |
	               getDeadRegState(ImpOp.isDead()) |
	               getUndefRegState(ImpOp.isUndef()));
	  }
	  // Change CMP32ri r, 0 back to TEST32rr r, r, etc.
	  switch (DataMI->getOpcode()) {
	  default: break;
	  case X86::CMP64ri32:
	  case X86::CMP64ri8:
	  case X86::CMP32ri:
	  case X86::CMP32ri8:
	  case X86::CMP16ri:
	  case X86::CMP16ri8:
	  case X86::CMP8ri: {
	    MachineOperand &MO0 = DataMI->getOperand(0);
	    MachineOperand &MO1 = DataMI->getOperand(1);
	    // Only a literal immediate can be compared against zero; check the
	    // operand kind before reading it as an immediate.
	    if (MO1.isImm() && MO1.getImm() == 0) {
	      unsigned NewOpc;
	      switch (DataMI->getOpcode()) {
	      default: llvm_unreachable("Unreachable!");
	      case X86::CMP64ri8:
	      case X86::CMP64ri32: NewOpc = X86::TEST64rr; break;
	      case X86::CMP32ri8:
	      case X86::CMP32ri:   NewOpc = X86::TEST32rr; break;
	      case X86::CMP16ri8:
	      case X86::CMP16ri:   NewOpc = X86::TEST16rr; break;
	      case X86::CMP8ri:    NewOpc = X86::TEST8rr; break;
	      }
	      DataMI->setDesc(get(NewOpc));
	      MO1.ChangeToRegister(MO0.getReg(), false);
	    }
	  }
	  }
	  NewMIs.push_back(DataMI);

	  // Emit the store instruction.
	  if (UnfoldStore) {
	    const TargetRegisterClass *DstRC = getRegClass(MCID, 0, &RI, MF);
	    std::pair<MachineInstr::mmo_iterator, MachineInstr::mmo_iterator> MMOs =
	        MF.extractStoreMemRefs(MI.memoperands_begin(), MI.memoperands_end());
	    storeRegToAddr(MF, Reg, true, AddrOps, DstRC, MMOs.first, MMOs.second, NewMIs);
	  }

	  return true;
	}

	/// SelectionDAG counterpart of the MachineInstr unfold: split the memory
	/// access folded into machine node \p N into separate nodes, appended to
	/// \p NewNodes in this order: an optional load, the register form of the
	/// operation, and an optional store.
	///
	/// The register-form opcode, the index of the folded operand and the
	/// load/store flags come from MemOp2RegOpTable.
	///
	/// \returns false when \p N is not a machine node, has no table entry, or
	/// when unfolding would introduce a slow unaligned 16-byte access; true
	/// otherwise.
	/// NOTE(review): the "slow unaligned store" bail-out happens after the load
	/// and operation nodes were already pushed to \p NewNodes — confirm callers
	/// ignore \p NewNodes on a false return.
	bool
	X86InstrInfo::unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N,
	                                  SmallVectorImpl<SDNode*> &NewNodes) const {
	  if (!N->isMachineOpcode())
	    return false;

	  auto I = MemOp2RegOpTable.find(N->getMachineOpcode());
	  if (I == MemOp2RegOpTable.end())
	    return false;
	  unsigned Opc = I->second.first;
	  unsigned Index = I->second.second & TB_INDEX_MASK;
	  bool FoldedLoad = I->second.second & TB_FOLDED_LOAD;
	  bool FoldedStore = I->second.second & TB_FOLDED_STORE;
	  const MCInstrDesc &MCID = get(Opc);
	  MachineFunction &MF = DAG.getMachineFunction();
	  const TargetRegisterClass *RC = getRegClass(MCID, Index, &RI, MF);
	  unsigned NumDefs = MCID.NumDefs;

	  // Partition N's operands (all but the last, which is used as the chain)
	  // into address operands and the operands before / after them. Index is
	  // shifted by NumDefs when applied to the node's operand list.
	  std::vector<SDValue> AddrOps;
	  std::vector<SDValue> BeforeOps;
	  std::vector<SDValue> AfterOps;
	  SDLoc dl(N);
	  unsigned NumOps = N->getNumOperands();
	  for (unsigned i = 0; i != NumOps-1; ++i) {
	    SDValue Op = N->getOperand(i);
	    if (i >= Index-NumDefs && i < Index-NumDefs + X86::AddrNumOperands)
	      AddrOps.push_back(Op);
	    else if (i < Index-NumDefs)
	      BeforeOps.push_back(Op);
	    else if (i > Index-NumDefs)
	      AfterOps.push_back(Op);
	  }
	  SDValue Chain = N->getOperand(NumOps-1);
	  AddrOps.push_back(Chain);

	  // Emit the load instruction.
	  SDNode *Load = nullptr;
	  if (FoldedLoad) {
	    EVT VT = *RC->vt_begin();
	    std::pair<MachineInstr::mmo_iterator,
	              MachineInstr::mmo_iterator> MMOs =
	        MF.extractLoadMemRefs(cast<MachineSDNode>(N)->memoperands_begin(),
	                              cast<MachineSDNode>(N)->memoperands_end());
	    // An empty range has no first element; only dereference MMOs.first when
	    // the range actually holds a memory operand.
	    const bool HasMMO = MMOs.first != MMOs.second && *MMOs.first;
	    if (!HasMMO &&
	        RC == &X86::VR128RegClass &&
	        Subtarget.isUnalignedMem16Slow())
	      // Do not introduce a slow unaligned load.
	      return false;
	    // FIXME: If a VR128 can have size 32, we should be checking if a 32-byte
	    // memory access is slow above.
	    unsigned Alignment = std::max<uint32_t>(RC->getSize(), 16);
	    bool isAligned = HasMMO &&
	                     (*MMOs.first)->getAlignment() >= Alignment;
	    Load = DAG.getMachineNode(getLoadRegOpcode(0, RC, isAligned, Subtarget), dl,
	                              VT, MVT::Other, AddrOps);
	    NewNodes.push_back(Load);

	    // Preserve memory reference information.
	    cast<MachineSDNode>(Load)->setMemRefs(MMOs.first, MMOs.second);
	  }

	  // Emit the data processing instruction. Its result types are the type of
	  // the destination register class (if the instruction defines anything),
	  // followed by N's remaining non-chain value types.
	  std::vector<EVT> VTs;
	  const TargetRegisterClass *DstRC = nullptr;
	  if (MCID.getNumDefs() > 0) {
	    DstRC = getRegClass(MCID, 0, &RI, MF);
	    VTs.push_back(*DstRC->vt_begin());
	  }
	  for (unsigned i = 0, e = N->getNumValues(); i != e; ++i) {
	    EVT VT = N->getValueType(i);
	    if (VT != MVT::Other && i >= (unsigned)MCID.getNumDefs())
	      VTs.push_back(VT);
	  }
	  if (Load)
	    BeforeOps.push_back(SDValue(Load, 0));
	  BeforeOps.insert(BeforeOps.end(), AfterOps.begin(), AfterOps.end());
	  SDNode *NewNode= DAG.getMachineNode(Opc, dl, VTs, BeforeOps);
	  NewNodes.push_back(NewNode);

	  // Emit the store instruction.
	  if (FoldedStore) {
	    // Replace the trailing chain with "<stored value>, <chain>".
	    AddrOps.pop_back();
	    AddrOps.push_back(SDValue(NewNode, 0));
	    AddrOps.push_back(Chain);
	    std::pair<MachineInstr::mmo_iterator,
	              MachineInstr::mmo_iterator> MMOs =
	        MF.extractStoreMemRefs(cast<MachineSDNode>(N)->memoperands_begin(),
	                               cast<MachineSDNode>(N)->memoperands_end());
	    // Same empty-range guard as on the load path.
	    const bool HasMMO = MMOs.first != MMOs.second && *MMOs.first;
	    if (!HasMMO &&
	        RC == &X86::VR128RegClass &&
	        Subtarget.isUnalignedMem16Slow())
	      // Do not introduce a slow unaligned store.
	      return false;
	    // FIXME: If a VR128 can have size 32, we should be checking if a 32-byte
	    // memory access is slow above.
	    unsigned Alignment = std::max<uint32_t>(RC->getSize(), 16);
	    bool isAligned = HasMMO &&
	                     (*MMOs.first)->getAlignment() >= Alignment;
	    SDNode *Store =
	        DAG.getMachineNode(getStoreRegOpcode(0, DstRC, isAligned, Subtarget),
	                           dl, MVT::Other, AddrOps);
	    NewNodes.push_back(Store);

	    // Preserve memory reference information.
	    cast<MachineSDNode>(Store)->setMemRefs(MMOs.first, MMOs.second);
	  }

	  return true;
	}

	/// Look up the register-form opcode that \p Opc unfolds to.
	///
	/// \param Opc          opcode to look up in MemOp2RegOpTable.
	/// \param UnfoldLoad   require that the table entry has a folded load.
	/// \param UnfoldStore  require that the table entry has a folded store.
	/// \param LoadRegIndex if non-null, receives the operand index stored in the
	///                     table entry (only written on success).
	/// \returns the unfolded opcode, or 0 when \p Opc has no table entry or the
	///          entry lacks a requested load/store.
	unsigned X86InstrInfo::getOpcodeAfterMemoryUnfold(unsigned Opc,
	                                      bool UnfoldLoad, bool UnfoldStore,
	                                      unsigned *LoadRegIndex) const {
	  const auto Entry = MemOp2RegOpTable.find(Opc);
	  if (Entry == MemOp2RegOpTable.end())
	    return 0;

	  // Entry->second is the (unfolded opcode, flags) pair.
	  const auto &Unfolded = Entry->second;
	  const bool HasFoldedLoad = Unfolded.second & TB_FOLDED_LOAD;
	  const bool HasFoldedStore = Unfolded.second & TB_FOLDED_STORE;

	  const bool MissingLoad = UnfoldLoad && !HasFoldedLoad;
	  const bool MissingStore = UnfoldStore && !HasFoldedStore;
	  if (MissingLoad || MissingStore)
	    return 0;

	  if (LoadRegIndex)
	    *LoadRegIndex = Unfolded.second & TB_INDEX_MASK;
	  return Unfolded.first;
	}

	bool
	X86InstrInfo::areLoadsFromSameBasePtr(SDNode Load1, SDNode Load2,
	int64_t &Offset1, int64_t &Offset2) const {
	if (!Load1->isMachineOpcode() \|\| !Load2->isMachineOpcode())
	return false;
	unsigned Opc1 = Load1->getMachineOpcode();
	unsigned Opc2 = Load2->getMachineOpcode();
	switch (Opc1) {
	default: return false;
	case X86::MOV8rm:
	case X86::MOV16rm:
	case X86::MOV32rm:
	case X86::MOV64rm:
	case X86::LD_Fp32m:
	case X86::LD_Fp64m:
	case X86::LD_Fp80m:
	case X86::MOVSSrm:
	case X86::MOVSDrm:
	case X86::MMX_MOVD64rm:
	case X86::MMX_MOVQ64rm:
	case X86::MOVAPSrm:
	case X86::MOVUPSrm:
	case X86::MOVAPDrm:
	case X86::MOVUPDrm:
	case X86::MOVDQArm:
	case X86::MOVDQUrm:
	// AVX load instructions
	case X86::VMOVSSrm:
	case X86::VMOVSDrm:
	case X86::VMOVAPSrm:
	case X86::VMOVUPSrm:
	case X86::VMOVAPDrm:
	case X86::VMOVUPDrm:
	case X86::VMOVDQArm:
	case X86::VMOVDQUrm:
	case X86::VMOVAPSYrm:
	case X86::VMOVUPSYrm:
	case X86::VMOVAPDYrm:
	case X86::VMOVUPDYrm:
	case X86::VMOVDQAYrm:
	case X86::VMOVDQUYrm:
	// AVX512 load instructions
	case X86::VMOVSSZrm:
	case X86::VMOVSDZrm:
	case X86::VMOVAPSZ128rm:
	case X86::VMOVUPSZ128rm:
	case X86::VMOVAPSZ128rm_NOVLX:
	case X86::VMOVUPSZ128rm_NOVLX:
	case X86::VMOVAPDZ128rm:
	case X86::VMOVUPDZ128rm:
	case X86::VMOVDQU8Z128rm:
	case X86::VMOVDQU16Z128rm:
	case X86::VMOVDQA32Z128rm:
	case X86::VMOVDQU32Z128rm:
	case X86::VMOVDQA64Z128rm:
	case X86::VMOVDQU64Z128rm:
	case X86::VMOVAPSZ256rm:
	case X86::VMOVUPSZ256rm:
	case X86::VMOVAPSZ256rm_NOVLX:
	case X86::VMOVUPSZ256rm_NOVLX:
	case X86::VMOVAPDZ256rm:
	case X86::VMOVUPDZ256rm:
	case X86::VMOVDQU8Z256rm:
	case X86::VMOVDQU16Z256rm:
	case X86::VMOVDQA32Z256rm:
	case X86::VMOVDQU32Z256rm:
	case X86::VMOVDQA64Z256rm:
	case X86::VMOVDQU64Z256rm:
	case X86::VMOVAPSZrm:
	case X86::VMOVUPSZrm:
	case X86::VMOVAPDZrm:
	case X86::VMOVUPDZrm:
	case X86::VMOVDQU8Zrm:
	case X86::VMOVDQU16Zrm:
	case X86::VMOVDQA32Zrm:
	case X86::VMOVDQU32Zrm:
	case X86::VMOVDQA64Zrm:
	case X86::VMOVDQU64Zrm:
	case X86::KMOVBkm:
	case X86::KMOVWkm:
	case X86::KMOVDkm:
	case X86::KMOVQkm:
	break;
	}
	switch (Opc2) {
	default: return false;
	case X86::MOV8rm:
	case X86::MOV16rm:
	case X86::MOV32rm:
	case X86::MOV64rm:
	case X86::LD_Fp32m:
	case X86::LD_Fp64m:
	case X86::LD_Fp80m:
	case X86::MOVSSrm:
	case X86::MOVSDrm:
	case X86::MMX_MOVD64rm:
	case X86::MMX_MOVQ64rm:
	case X86::MOVAPSrm:
	case X86::MOVUPSrm:
	case X86::MOVAPDrm:
	case X86::MOVUPDrm:
	case X86::MOVDQArm:
	case X86::MOVDQUrm:
	// AVX load instructions
	case X86::VMOVSSrm:
	case X86::VMOVSDrm:
	case X86::VMOVAPSrm:
	case X86::VMOVUPSrm:
	case X86::VMOVAPDrm:
	case X86::VMOVUPDrm:
	case X86::VMOVDQArm:
	case X86::VMOVDQUrm:
	case X86::VMOVAPSYrm:
	case X86::VMOVUPSYrm:
	case X86::VMOVAPDYrm:
	case X86::VMOVUPDYrm:
	case X86::VMOVDQAYrm:
	case X86::VMOVDQUYrm:
	// AVX512 load instructions
	case X86::VMOVSSZrm:
	case X86::VMOVSDZrm:
	case X86::VMOVAPSZ128rm:
	case X86::VMOVUPSZ128rm:
	case X86::VMOVAPSZ128rm_NOVLX:
	case X86::VMOVUPSZ128rm_NOVLX:
	case X86::VMOVAPDZ128rm:
	case X86::VMOVUPDZ128rm:
	case X86::VMOVDQU8Z128rm:
	case X86::VMOVDQU16Z128rm:
	case X86::VMOVDQA32Z128rm:
	case X86::VMOVDQU32Z128rm:
	case X86::VMOVDQA64Z128rm:
	case X86::VMOVDQU64Z128rm:
	case X86::VMOVAPSZ256rm:
	case X86::VMOVUPSZ256rm:
	case X86::VMOVAPSZ256rm_NOVLX:
	case X86::VMOVUPSZ256rm_NOVLX:
	case X86::VMOVAPDZ256rm:
	case X86::VMOVUPDZ256rm:
	case X86::VMOVDQU8Z256rm:
	case X86::VMOVDQU16Z256rm:
	case X86::VMOVDQA32Z256rm:
	case X86::VMOVDQU32Z256rm:
	case X86::VMOVDQA64Z256rm:
	case X86::VMOVDQU64Z256rm:
	case X86::VMOVAPSZrm:
	case X86::VMOVUPSZrm:
	case X86::VMOVAPDZrm:
	case X86::VMOVUPDZrm:
	case X86::VMOVDQU8Zrm:
	case X86::VMOVDQU16Zrm:
	case X86::VMOVDQA32Zrm:
	case X86::VMOVDQU32Zrm:
	case X86::VMOVDQA64Zrm:
	case X86::VMOVDQU64Zrm:
	case X86::KMOVBkm:
	case X86::KMOVWkm:
	case X86::KMOVDkm:
	case X86::KMOVQkm:
	break;
	}

	// Check if chain operands and base addresses match.
	if (Load1->getOperand(0) != Load2->getOperand(0) \|\|
	Load1->getOperand(5) != Load2->getOperand(5))
	return false;
	// Segment operands should match as well.
	if (Load1->getOperand(4) != Load2->getOperand(4))
	return false;
	// Scale should be 1, Index should be Reg0.
	if (Load1->getOperand(1) == Load2->getOperand(1) &&
	Load1->getOperand(2) == Load2->getOperand(2)) {
	if (cast<ConstantSDNode>(Load1->getOperand(1))->getZExtValue() != 1)
	return false;

	// Now let's examine the displacements.
	if (isa<ConstantSDNode>(Load1->getOperand(3)) &&
	isa<ConstantSDNode>(Load2->getOperand(3))) {
	Offset1 = cast<ConstantSDNode>(Load1->getOperand(3))->getSExtValue();
	Offset2 = cast<ConstantSDNode>(Load2->getOperand(3))->getSExtValue();
	return true;
	}
	}
	return false;
	}

	bool X86InstrInfo::shouldScheduleLoadsNear(SDNode Load1, SDNode Load2,
	int64_t Offset1, int64_t Offset2,
	unsigned NumLoads) const {
	assert(Offset2 > Offset1);
	if ((Offset2 - Offset1) / 8 > 64)
	return false;

	unsigned Opc1 = Load1->getMachineOpcode();
	unsigned Opc2 = Load2->getMachineOpcode();
	if (Opc1 != Opc2)
	return false; // FIXME: overly conservative?

	switch (Opc1) {
	default: break;
	case X86::LD_Fp32m:
	case X86::LD_Fp64m:
	case X86::LD_Fp80m:
	case X86::MMX_MOVD64rm:
	case X86::MMX_MOVQ64rm:
	return false;
	}

	EVT VT = Load1->getValueType(0);
	switch (VT.getSimpleVT().SimpleTy) {
	default:
	// XMM registers. In 64-bit mode we can be a bit more aggressive since we
	// have 16 of them to play with.
	if (Subtarget.is64Bit()) {
	if (NumLoads >= 3)
	return false;
	} else if (NumLoads) {
	return false;
	}
	break;
	case MVT::i8:
	case MVT::i16:
	case MVT::i32:
	case MVT::i64:
	case MVT::f32:
	case MVT::f64:
	if (NumLoads)
	return false;
	break;
	}

	return true;
	}

	/// Decide whether \p First and \p Second should be kept adjacent so that
	/// they can macro-fuse.
	///
	/// \p Second must be one of the conditional jumps listed below; its opcode
	/// selects a fuse kind (FuseTest, FuseCmp or FuseInc). \p First is then
	/// matched against three opcode groups:
	///  - TEST / AND: accepted for every fuse kind;
	///  - CMP / ADD / SUB: accepted for FuseCmp and FuseInc;
	///  - INC / DEC: accepted for FuseInc only.
	///
	/// \returns false when the subtarget lacks AVX, when either opcode is not
	/// listed, or when the pair's groups do not match as described above.
	bool X86InstrInfo::shouldScheduleAdjacent(const MachineInstr &First,
	                                          const MachineInstr &Second) const {
	  // Check if this processor supports macro-fusion. Since this is a minor
	  // heuristic, we haven't specifically reserved a feature. hasAVX is a decent
	  // proxy for SandyBridge+.
	  if (!Subtarget.hasAVX())
	    return false;

	  enum {
	    FuseTest,
	    FuseCmp,
	    FuseInc
	  } FuseKind;

	  switch (Second.getOpcode()) {
	  default:
	    return false;
	  case X86::JE_1:
	  case X86::JNE_1:
	  case X86::JL_1:
	  case X86::JLE_1:
	  case X86::JG_1:
	  case X86::JGE_1:
	    FuseKind = FuseInc;
	    break;
	  case X86::JB_1:
	  case X86::JBE_1:
	  case X86::JA_1:
	  case X86::JAE_1:
	    FuseKind = FuseCmp;
	    break;
	  case X86::JS_1:
	  case X86::JNS_1:
	  case X86::JP_1:
	  case X86::JNP_1:
	  case X86::JO_1:
	  case X86::JNO_1:
	    FuseKind = FuseTest;
	    break;
	  }
	  switch (First.getOpcode()) {
	  default:
	    return false;
	  case X86::TEST8rr:
	  case X86::TEST16rr:
	  case X86::TEST32rr:
	  case X86::TEST64rr:
	  case X86::TEST8ri:
	  case X86::TEST16ri:
	  case X86::TEST32ri:
	  case X86::TEST32i32:
	  case X86::TEST64i32:
	  case X86::TEST64ri32:
	  case X86::TEST8rm:
	  case X86::TEST16rm:
	  case X86::TEST32rm:
	  case X86::TEST64rm:
	  case X86::TEST8ri_NOREX:
	  case X86::AND16i16:
	  case X86::AND16ri:
	  case X86::AND16ri8:
	  case X86::AND16rm:
	  case X86::AND16rr:
	  case X86::AND32i32:
	  case X86::AND32ri:
	  case X86::AND32ri8:
	  case X86::AND32rm:
	  case X86::AND32rr:
	  case X86::AND64i32:
	  case X86::AND64ri32:
	  case X86::AND64ri8:
	  case X86::AND64rm:
	  case X86::AND64rr:
	  case X86::AND8i8:
	  case X86::AND8ri:
	  case X86::AND8rm:
	  case X86::AND8rr:
	    return true;
	  case X86::CMP16i16:
	  case X86::CMP16ri:
	  case X86::CMP16ri8:
	  case X86::CMP16rm:
	  case X86::CMP16rr:
	  case X86::CMP32i32:
	  case X86::CMP32ri:
	  case X86::CMP32ri8:
	  case X86::CMP32rm:
	  case X86::CMP32rr:
	  case X86::CMP64i32:
	  case X86::CMP64ri32:
	  case X86::CMP64ri8:
	  case X86::CMP64rm:
	  case X86::CMP64rr:
	  case X86::CMP8i8:
	  case X86::CMP8ri:
	  case X86::CMP8rm:
	  case X86::CMP8rr:
	  case X86::ADD16i16:
	  case X86::ADD16ri:
	  case X86::ADD16ri8:
	  case X86::ADD16ri8_DB:
	  case X86::ADD16ri_DB:
	  case X86::ADD16rm:
	  case X86::ADD16rr:
	  case X86::ADD16rr_DB:
	  case X86::ADD32i32:
	  case X86::ADD32ri:
	  case X86::ADD32ri8:
	  case X86::ADD32ri8_DB:
	  case X86::ADD32ri_DB:
	  case X86::ADD32rm:
	  case X86::ADD32rr:
	  case X86::ADD32rr_DB:
	  case X86::ADD64i32:
	  case X86::ADD64ri32:
	  case X86::ADD64ri32_DB:
	  case X86::ADD64ri8:
	  case X86::ADD64ri8_DB:
	  case X86::ADD64rm:
	  case X86::ADD64rr:
	  case X86::ADD64rr_DB:
	  case X86::ADD8i8:
	  case X86::ADD8mi:
	  case X86::ADD8mr:
	  case X86::ADD8ri:
	  case X86::ADD8rm:
	  case X86::ADD8rr:
	  case X86::SUB16i16:
	  case X86::SUB16ri:
	  case X86::SUB16ri8:
	  case X86::SUB16rm:
	  case X86::SUB16rr:
	  case X86::SUB32i32:
	  case X86::SUB32ri:
	  case X86::SUB32ri8:
	  case X86::SUB32rm:
	  case X86::SUB32rr:
	  case X86::SUB64i32:
	  case X86::SUB64ri32:
	  case X86::SUB64ri8:
	  case X86::SUB64rm:
	  case X86::SUB64rr:
	  case X86::SUB8i8:
	  case X86::SUB8ri:
	  case X86::SUB8rm:
	  case X86::SUB8rr:
	    return FuseKind == FuseCmp || FuseKind == FuseInc;
	  case X86::INC16r:
	  case X86::INC32r:
	  case X86::INC64r:
	  case X86::INC8r:
	  case X86::DEC16r:
	  case X86::DEC32r:
	  case X86::DEC64r:
	  case X86::DEC8r:
	    return FuseKind == FuseInc;
	  }
	}

	/// Replace the single condition code held in \p Cond with its opposite, as
	/// computed by GetOppositeBranchCondition.
	///
	/// \p Cond must contain exactly one operand (asserted).
	/// \returns false, always.
	bool X86InstrInfo::
	reverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const {
	  assert(Cond.size() == 1 && "Invalid X86 branch condition!");
	  MachineOperand &CondOp = Cond[0];
	  const auto Current = static_cast<X86::CondCode>(CondOp.getImm());
	  CondOp.setImm(GetOppositeBranchCondition(Current));
	  return false;
	}

	/// Return true if definitions of registers in class \p RC may be moved.
	///
	/// Currently this is false exactly for the CCR, RFP32, RFP64 and RFP80
	/// register classes and true for every other class.
	bool X86InstrInfo::
	isSafeToMoveRegClassDefs(const TargetRegisterClass *RC) const {
	  // FIXME: Return false for x87 stack register classes for now. We can't
	  // allow any loads of these registers before FpGet_ST0_80.
	  return !(RC == &X86::CCRRegClass || RC == &X86::RFP32RegClass ||
	           RC == &X86::RFP64RegClass || RC == &X86::RFP80RegClass);
	}

	/// Return a virtual register initialized with the global base register
	/// value. Output instructions required to initialize the register in the
	/// function entry block, if necessary.
	///
	/// The register is cached in the function's X86MachineFunctionInfo, so
	/// repeated calls for the same \p MF return the same register. Must not be
	/// called on a 64-bit subtarget (asserted).
	///
	/// TODO: Eliminate this and move the code to X86MachineFunctionInfo.
	///
	unsigned X86InstrInfo::getGlobalBaseReg(MachineFunction *MF) const {
	  assert(!Subtarget.is64Bit() &&
	         "X86-64 PIC uses RIP relative addressing");

	  X86MachineFunctionInfo *FuncInfo = MF->getInfo<X86MachineFunctionInfo>();

	  // Reuse the register if one has already been recorded for this function.
	  if (unsigned Existing = FuncInfo->getGlobalBaseReg())
	    return Existing;

	  // Create the register. The code to initialize it is inserted
	  // later, by the CGBR pass (below).
	  MachineRegisterInfo &MRI = MF->getRegInfo();
	  const unsigned FreshReg =
	      MRI.createVirtualRegister(&X86::GR32_NOSPRegClass);
	  FuncInfo->setGlobalBaseReg(FreshReg);
	  return FreshReg;
	}

	// These are the replaceable SSE instructions. Each row lists the opcodes that
	// perform the same operation in the PackedSingle, PackedDouble and PackedInt
	// execution domains; lookup() indexes a row with (domain - 1).
	// Some of these have Int variants that we don't include here. We don't want
	// to replace instructions selected by intrinsics.
	static const uint16_t ReplaceableInstrs[][3] = {
	  //PackedSingle PackedDouble PackedInt
	  { X86::MOVAPSmr, X86::MOVAPDmr, X86::MOVDQAmr },
	  { X86::MOVAPSrm, X86::MOVAPDrm, X86::MOVDQArm },
	  { X86::MOVAPSrr, X86::MOVAPDrr, X86::MOVDQArr },
	  { X86::MOVUPSmr, X86::MOVUPDmr, X86::MOVDQUmr },
	  { X86::MOVUPSrm, X86::MOVUPDrm, X86::MOVDQUrm },
	  { X86::MOVLPSmr, X86::MOVLPDmr, X86::MOVPQI2QImr },
	  { X86::MOVSSmr, X86::MOVSSmr, X86::MOVPDI2DImr },
	  { X86::MOVSDrm, X86::MOVSDrm, X86::MOVQI2PQIrm },
	  { X86::MOVSSrm, X86::MOVSSrm, X86::MOVDI2PDIrm },
	  { X86::MOVNTPSmr, X86::MOVNTPDmr, X86::MOVNTDQmr },
	  { X86::ANDNPSrm, X86::ANDNPDrm, X86::PANDNrm },
	  { X86::ANDNPSrr, X86::ANDNPDrr, X86::PANDNrr },
	  { X86::ANDPSrm, X86::ANDPDrm, X86::PANDrm },
	  { X86::ANDPSrr, X86::ANDPDrr, X86::PANDrr },
	  { X86::ORPSrm, X86::ORPDrm, X86::PORrm },
	  { X86::ORPSrr, X86::ORPDrr, X86::PORrr },
	  { X86::XORPSrm, X86::XORPDrm, X86::PXORrm },
	  { X86::XORPSrr, X86::XORPDrr, X86::PXORrr },
	  // AVX 128-bit support
	  { X86::VMOVAPSmr, X86::VMOVAPDmr, X86::VMOVDQAmr },
	  { X86::VMOVAPSrm, X86::VMOVAPDrm, X86::VMOVDQArm },
	  { X86::VMOVAPSrr, X86::VMOVAPDrr, X86::VMOVDQArr },
	  { X86::VMOVUPSmr, X86::VMOVUPDmr, X86::VMOVDQUmr },
	  { X86::VMOVUPSrm, X86::VMOVUPDrm, X86::VMOVDQUrm },
	  { X86::VMOVLPSmr, X86::VMOVLPDmr, X86::VMOVPQI2QImr },
	  { X86::VMOVSSmr, X86::VMOVSSmr, X86::VMOVPDI2DImr },
	  { X86::VMOVSDrm, X86::VMOVSDrm, X86::VMOVQI2PQIrm },
	  { X86::VMOVSSrm, X86::VMOVSSrm, X86::VMOVDI2PDIrm },
	  { X86::VMOVNTPSmr, X86::VMOVNTPDmr, X86::VMOVNTDQmr },
	  { X86::VANDNPSrm, X86::VANDNPDrm, X86::VPANDNrm },
	  { X86::VANDNPSrr, X86::VANDNPDrr, X86::VPANDNrr },
	  { X86::VANDPSrm, X86::VANDPDrm, X86::VPANDrm },
	  { X86::VANDPSrr, X86::VANDPDrr, X86::VPANDrr },
	  { X86::VORPSrm, X86::VORPDrm, X86::VPORrm },
	  { X86::VORPSrr, X86::VORPDrr, X86::VPORrr },
	  { X86::VXORPSrm, X86::VXORPDrm, X86::VPXORrm },
	  { X86::VXORPSrr, X86::VXORPDrr, X86::VPXORrr },
	  // AVX 256-bit support
	  { X86::VMOVAPSYmr, X86::VMOVAPDYmr, X86::VMOVDQAYmr },
	  { X86::VMOVAPSYrm, X86::VMOVAPDYrm, X86::VMOVDQAYrm },
	  { X86::VMOVAPSYrr, X86::VMOVAPDYrr, X86::VMOVDQAYrr },
	  { X86::VMOVUPSYmr, X86::VMOVUPDYmr, X86::VMOVDQUYmr },
	  { X86::VMOVUPSYrm, X86::VMOVUPDYrm, X86::VMOVDQUYrm },
	  { X86::VMOVNTPSYmr, X86::VMOVNTPDYmr, X86::VMOVNTDQYmr },
	  // AVX512 support
	  { X86::VMOVLPSZ128mr, X86::VMOVLPDZ128mr, X86::VMOVPQI2QIZmr },
	  // Non-temporal stores, one row per vector width (Z128, Z256, Z). The Z256
	  // row used to be a second copy of the Z128 row, which left the 256-bit
	  // opcodes without an entry in this table.
	  { X86::VMOVNTPSZ128mr, X86::VMOVNTPDZ128mr, X86::VMOVNTDQZ128mr },
	  { X86::VMOVNTPSZ256mr, X86::VMOVNTPDZ256mr, X86::VMOVNTDQZ256mr },
	  { X86::VMOVNTPSZmr, X86::VMOVNTPDZmr, X86::VMOVNTDQZmr },
	  { X86::VMOVSDZmr, X86::VMOVSDZmr, X86::VMOVPQI2QIZmr },
	  { X86::VMOVSSZmr, X86::VMOVSSZmr, X86::VMOVPDI2DIZmr },
	  { X86::VMOVSDZrm, X86::VMOVSDZrm, X86::VMOVQI2PQIZrm },
	  { X86::VMOVSSZrm, X86::VMOVSSZrm, X86::VMOVDI2PDIZrm },
	  { X86::VBROADCASTSSZ128r, X86::VBROADCASTSSZ128r, X86::VPBROADCASTDZ128r },
	  { X86::VBROADCASTSSZ128m, X86::VBROADCASTSSZ128m, X86::VPBROADCASTDZ128m },
	  { X86::VBROADCASTSSZ256r, X86::VBROADCASTSSZ256r, X86::VPBROADCASTDZ256r },
	  { X86::VBROADCASTSSZ256m, X86::VBROADCASTSSZ256m, X86::VPBROADCASTDZ256m },
	  { X86::VBROADCASTSSZr, X86::VBROADCASTSSZr, X86::VPBROADCASTDZr },
	  { X86::VBROADCASTSSZm, X86::VBROADCASTSSZm, X86::VPBROADCASTDZm },
	  { X86::VBROADCASTSDZ256r, X86::VBROADCASTSDZ256r, X86::VPBROADCASTQZ256r },
	  { X86::VBROADCASTSDZ256m, X86::VBROADCASTSDZ256m, X86::VPBROADCASTQZ256m },
	  { X86::VBROADCASTSDZr, X86::VBROADCASTSDZr, X86::VPBROADCASTQZr },
	  { X86::VBROADCASTSDZm, X86::VBROADCASTSDZm, X86::VPBROADCASTQZm },
	};

	// Domain-equivalent opcodes whose PackedInt column is only usable when the
	// subtarget has AVX2 (see getExecutionDomain / setExecutionDomain).
	// Rows are indexed with (domain - 1), same as ReplaceableInstrs.
	static const uint16_t ReplaceableInstrsAVX2[][3] = {
	  // PackedSingle          PackedDouble            PackedInt
	  // Bitwise logic, Y-suffixed forms.
	  { X86::VANDNPSYrm,       X86::VANDNPDYrm,        X86::VPANDNYrm },
	  { X86::VANDNPSYrr,       X86::VANDNPDYrr,        X86::VPANDNYrr },
	  { X86::VANDPSYrm,        X86::VANDPDYrm,         X86::VPANDYrm },
	  { X86::VANDPSYrr,        X86::VANDPDYrr,         X86::VPANDYrr },
	  { X86::VORPSYrm,         X86::VORPDYrm,          X86::VPORYrm },
	  { X86::VORPSYrr,         X86::VORPDYrr,          X86::VPORYrr },
	  { X86::VXORPSYrm,        X86::VXORPDYrm,         X86::VPXORYrm },
	  { X86::VXORPSYrr,        X86::VXORPDYrr,         X86::VPXORYrr },
	  // 128-bit lane extract / insert / permute. The same opcode fills both
	  // floating-point columns.
	  { X86::VEXTRACTF128mr,   X86::VEXTRACTF128mr,    X86::VEXTRACTI128mr },
	  { X86::VEXTRACTF128rr,   X86::VEXTRACTF128rr,    X86::VEXTRACTI128rr },
	  { X86::VINSERTF128rm,    X86::VINSERTF128rm,     X86::VINSERTI128rm },
	  { X86::VINSERTF128rr,    X86::VINSERTF128rr,     X86::VINSERTI128rr },
	  { X86::VPERM2F128rm,     X86::VPERM2F128rm,      X86::VPERM2I128rm },
	  { X86::VPERM2F128rr,     X86::VPERM2F128rr,      X86::VPERM2I128rr },
	  // Broadcasts.
	  { X86::VBROADCASTSSrm,   X86::VBROADCASTSSrm,    X86::VPBROADCASTDrm },
	  { X86::VBROADCASTSSrr,   X86::VBROADCASTSSrr,    X86::VPBROADCASTDrr },
	  { X86::VBROADCASTSSYrr,  X86::VBROADCASTSSYrr,   X86::VPBROADCASTDYrr },
	  { X86::VBROADCASTSSYrm,  X86::VBROADCASTSSYrm,   X86::VPBROADCASTDYrm },
	  { X86::VBROADCASTSDYrr,  X86::VBROADCASTSDYrr,   X86::VPBROADCASTQYrr },
	  { X86::VBROADCASTSDYrm,  X86::VBROADCASTSDYrm,   X86::VPBROADCASTQYrm },
	  { X86::VBROADCASTF128,   X86::VBROADCASTF128,    X86::VBROADCASTI128 },
	};

	// AVX-512 vector moves. Unlike the three-column tables, each row carries two
	// integer-domain opcodes: column 2 is the 64-bit-element form and column 3
	// the 32-bit-element form. lookupAVX512() checks both for the integer domain.
	static const uint16_t ReplaceableInstrsAVX512[][4] = {
	  // PackedSingle PackedDouble PackedInt(64-bit elts) PackedInt(32-bit elts)
	  // Z128 forms.
	  { X86::VMOVAPSZ128mr, X86::VMOVAPDZ128mr,
	    X86::VMOVDQA64Z128mr, X86::VMOVDQA32Z128mr },
	  { X86::VMOVAPSZ128rm, X86::VMOVAPDZ128rm,
	    X86::VMOVDQA64Z128rm, X86::VMOVDQA32Z128rm },
	  { X86::VMOVAPSZ128rr, X86::VMOVAPDZ128rr,
	    X86::VMOVDQA64Z128rr, X86::VMOVDQA32Z128rr },
	  { X86::VMOVUPSZ128mr, X86::VMOVUPDZ128mr,
	    X86::VMOVDQU64Z128mr, X86::VMOVDQU32Z128mr },
	  { X86::VMOVUPSZ128rm, X86::VMOVUPDZ128rm,
	    X86::VMOVDQU64Z128rm, X86::VMOVDQU32Z128rm },
	  // Z256 forms.
	  { X86::VMOVAPSZ256mr, X86::VMOVAPDZ256mr,
	    X86::VMOVDQA64Z256mr, X86::VMOVDQA32Z256mr },
	  { X86::VMOVAPSZ256rm, X86::VMOVAPDZ256rm,
	    X86::VMOVDQA64Z256rm, X86::VMOVDQA32Z256rm },
	  { X86::VMOVAPSZ256rr, X86::VMOVAPDZ256rr,
	    X86::VMOVDQA64Z256rr, X86::VMOVDQA32Z256rr },
	  { X86::VMOVUPSZ256mr, X86::VMOVUPDZ256mr,
	    X86::VMOVDQU64Z256mr, X86::VMOVDQU32Z256mr },
	  { X86::VMOVUPSZ256rm, X86::VMOVUPDZ256rm,
	    X86::VMOVDQU64Z256rm, X86::VMOVDQU32Z256rm },
	  // Z forms.
	  { X86::VMOVAPSZmr, X86::VMOVAPDZmr,
	    X86::VMOVDQA64Zmr, X86::VMOVDQA32Zmr },
	  { X86::VMOVAPSZrm, X86::VMOVAPDZrm,
	    X86::VMOVDQA64Zrm, X86::VMOVDQA32Zrm },
	  { X86::VMOVAPSZrr, X86::VMOVAPDZrr,
	    X86::VMOVDQA64Zrr, X86::VMOVDQA32Zrr },
	  { X86::VMOVUPSZmr, X86::VMOVUPDZmr,
	    X86::VMOVDQU64Zmr, X86::VMOVDQU32Zmr },
	  { X86::VMOVUPSZrm, X86::VMOVUPDZrm,
	    X86::VMOVDQU64Zrm, X86::VMOVDQU32Zrm },
	};

	static const uint16_t ReplaceableInstrsAVX512DQ[][4] = {
	// Two integer columns for 64-bit and 32-bit elements.
	//PackedSingle PackedDouble PackedInt PackedInt
	{ X86::VANDNPSZ128rm, X86::VANDNPDZ128rm, X86::VPANDNQZ128rm, X86::VPANDNDZ128rm },
	{ X86::VANDNPSZ128rr, X86::VANDNPDZ128rr, X86::VPANDNQZ128rr, X86::VPANDNDZ128rr },
	{ X86::VANDPSZ128rm, X86::VANDPDZ128rm, X86::VPANDQZ128rm, X86::VPANDDZ128rm },
	{ X86::VANDPSZ128rr, X86::VANDPDZ128rr, X86::VPANDQZ128rr, X86::VPANDDZ128rr },
	{ X86::VORPSZ128rm, X86::VORPDZ128rm, X86::VPORQZ128rm, X86::VPORDZ128rm },
	{ X86::VORPSZ128rr, X86::VORPDZ128rr, X86::VPORQZ128rr, X86::VPORDZ128rr },
	{ X86::VXORPSZ128rm, X86::VXORPDZ128rm, X86::VPXORQZ128rm, X86::VPXORDZ128rm },
	{ X86::VXORPSZ128rr, X86::VXORPDZ128rr, X86::VPXORQZ128rr, X86::VPXORDZ128rr },
	{ X86::VANDNPSZ256rm, X86::VANDNPDZ256rm, X86::VPANDNQZ256rm, X86::VPANDNDZ256rm },
	{ X86::VANDNPSZ256rr, X86::VANDNPDZ256rr, X86::VPANDNQZ256rr, X86::VPANDNDZ256rr },
	{ X86::VANDPSZ256rm, X86::VANDPDZ256rm, X86::VPANDQZ256rm, X86::VPANDDZ256rm },
	{ X86::VANDPSZ256rr, X86::VANDPDZ256rr, X86::VPANDQZ256rr, X86::VPANDDZ256rr },
	{ X86::VORPSZ256rm, X86::VORPDZ256rm, X86::VPORQZ256rm, X86::VPORDZ256rm },
	{ X86::VORPSZ256rr, X86::VORPDZ256rr, X86::VPORQZ256rr, X86::VPORDZ256rr },
	{ X86::VXORPSZ256rm, X86::VXORPDZ256rm, X86::VPXORQZ256rm, X86::VPXORDZ256rm },
	{ X86::VXORPSZ256rr, X86::VXORPDZ256rr, X86::VPXORQZ256rr, X86::VPXORDZ256rr },
	{ X86::VANDNPSZrm, X86::VANDNPDZrm, X86::VPANDNQZrm, X86::VPANDNDZrm },
	{ X86::VANDNPSZrr, X86::VANDNPDZrr, X86::VPANDNQZrr, X86::VPANDNDZrr },
	{ X86::VANDPSZrm, X86::VANDPDZrm, X86::VPANDQZrm, X86::VPANDDZrm },
	{ X86::VANDPSZrr, X86::VANDPDZrr, X86::VPANDQZrr, X86::VPANDDZrr },
	{ X86::VORPSZrm, X86::VORPDZrm, X86::VPORQZrm, X86::VPORDZrm },
	{ X86::VORPSZrr, X86::VORPDZrr, X86::VPORQZrr, X86::VPORDZrr },
	{ X86::VXORPSZrm, X86::VXORPDZrm, X86::VPXORQZrm, X86::VPXORDZrm },
	{ X86::VXORPSZrr, X86::VXORPDZrr, X86::VPXORQZrr, X86::VPXORDZrr },
	};

	// Masked and broadcast-load AVX-512 bitwise logic. For these rows
	// getExecutionDomain only offers domain changes that keep the element size
	// (PackedSingle <-> 32-bit integer column, PackedDouble <-> 64-bit integer
	// column). Every row appears exactly once; lookupAVX512() returns the first
	// match, so a repeated row could never be selected.
	static const uint16_t ReplaceableInstrsAVX512DQMasked[][4] = {
	  // Two integer columns for 64-bit and 32-bit elements.
	  //PackedSingle PackedDouble
	  //PackedInt PackedInt
	  { X86::VANDNPSZ128rmk, X86::VANDNPDZ128rmk,
	    X86::VPANDNQZ128rmk, X86::VPANDNDZ128rmk },
	  { X86::VANDNPSZ128rmkz, X86::VANDNPDZ128rmkz,
	    X86::VPANDNQZ128rmkz, X86::VPANDNDZ128rmkz },
	  { X86::VANDNPSZ128rrk, X86::VANDNPDZ128rrk,
	    X86::VPANDNQZ128rrk, X86::VPANDNDZ128rrk },
	  { X86::VANDNPSZ128rrkz, X86::VANDNPDZ128rrkz,
	    X86::VPANDNQZ128rrkz, X86::VPANDNDZ128rrkz },
	  { X86::VANDPSZ128rmk, X86::VANDPDZ128rmk,
	    X86::VPANDQZ128rmk, X86::VPANDDZ128rmk },
	  { X86::VANDPSZ128rmkz, X86::VANDPDZ128rmkz,
	    X86::VPANDQZ128rmkz, X86::VPANDDZ128rmkz },
	  { X86::VANDPSZ128rrk, X86::VANDPDZ128rrk,
	    X86::VPANDQZ128rrk, X86::VPANDDZ128rrk },
	  { X86::VANDPSZ128rrkz, X86::VANDPDZ128rrkz,
	    X86::VPANDQZ128rrkz, X86::VPANDDZ128rrkz },
	  { X86::VORPSZ128rmk, X86::VORPDZ128rmk,
	    X86::VPORQZ128rmk, X86::VPORDZ128rmk },
	  { X86::VORPSZ128rmkz, X86::VORPDZ128rmkz,
	    X86::VPORQZ128rmkz, X86::VPORDZ128rmkz },
	  { X86::VORPSZ128rrk, X86::VORPDZ128rrk,
	    X86::VPORQZ128rrk, X86::VPORDZ128rrk },
	  { X86::VORPSZ128rrkz, X86::VORPDZ128rrkz,
	    X86::VPORQZ128rrkz, X86::VPORDZ128rrkz },
	  { X86::VXORPSZ128rmk, X86::VXORPDZ128rmk,
	    X86::VPXORQZ128rmk, X86::VPXORDZ128rmk },
	  { X86::VXORPSZ128rmkz, X86::VXORPDZ128rmkz,
	    X86::VPXORQZ128rmkz, X86::VPXORDZ128rmkz },
	  { X86::VXORPSZ128rrk, X86::VXORPDZ128rrk,
	    X86::VPXORQZ128rrk, X86::VPXORDZ128rrk },
	  { X86::VXORPSZ128rrkz, X86::VXORPDZ128rrkz,
	    X86::VPXORQZ128rrkz, X86::VPXORDZ128rrkz },
	  { X86::VANDNPSZ256rmk, X86::VANDNPDZ256rmk,
	    X86::VPANDNQZ256rmk, X86::VPANDNDZ256rmk },
	  { X86::VANDNPSZ256rmkz, X86::VANDNPDZ256rmkz,
	    X86::VPANDNQZ256rmkz, X86::VPANDNDZ256rmkz },
	  { X86::VANDNPSZ256rrk, X86::VANDNPDZ256rrk,
	    X86::VPANDNQZ256rrk, X86::VPANDNDZ256rrk },
	  { X86::VANDNPSZ256rrkz, X86::VANDNPDZ256rrkz,
	    X86::VPANDNQZ256rrkz, X86::VPANDNDZ256rrkz },
	  { X86::VANDPSZ256rmk, X86::VANDPDZ256rmk,
	    X86::VPANDQZ256rmk, X86::VPANDDZ256rmk },
	  { X86::VANDPSZ256rmkz, X86::VANDPDZ256rmkz,
	    X86::VPANDQZ256rmkz, X86::VPANDDZ256rmkz },
	  { X86::VANDPSZ256rrk, X86::VANDPDZ256rrk,
	    X86::VPANDQZ256rrk, X86::VPANDDZ256rrk },
	  { X86::VANDPSZ256rrkz, X86::VANDPDZ256rrkz,
	    X86::VPANDQZ256rrkz, X86::VPANDDZ256rrkz },
	  { X86::VORPSZ256rmk, X86::VORPDZ256rmk,
	    X86::VPORQZ256rmk, X86::VPORDZ256rmk },
	  { X86::VORPSZ256rmkz, X86::VORPDZ256rmkz,
	    X86::VPORQZ256rmkz, X86::VPORDZ256rmkz },
	  { X86::VORPSZ256rrk, X86::VORPDZ256rrk,
	    X86::VPORQZ256rrk, X86::VPORDZ256rrk },
	  { X86::VORPSZ256rrkz, X86::VORPDZ256rrkz,
	    X86::VPORQZ256rrkz, X86::VPORDZ256rrkz },
	  { X86::VXORPSZ256rmk, X86::VXORPDZ256rmk,
	    X86::VPXORQZ256rmk, X86::VPXORDZ256rmk },
	  { X86::VXORPSZ256rmkz, X86::VXORPDZ256rmkz,
	    X86::VPXORQZ256rmkz, X86::VPXORDZ256rmkz },
	  { X86::VXORPSZ256rrk, X86::VXORPDZ256rrk,
	    X86::VPXORQZ256rrk, X86::VPXORDZ256rrk },
	  { X86::VXORPSZ256rrkz, X86::VXORPDZ256rrkz,
	    X86::VPXORQZ256rrkz, X86::VPXORDZ256rrkz },
	  { X86::VANDNPSZrmk, X86::VANDNPDZrmk,
	    X86::VPANDNQZrmk, X86::VPANDNDZrmk },
	  { X86::VANDNPSZrmkz, X86::VANDNPDZrmkz,
	    X86::VPANDNQZrmkz, X86::VPANDNDZrmkz },
	  { X86::VANDNPSZrrk, X86::VANDNPDZrrk,
	    X86::VPANDNQZrrk, X86::VPANDNDZrrk },
	  { X86::VANDNPSZrrkz, X86::VANDNPDZrrkz,
	    X86::VPANDNQZrrkz, X86::VPANDNDZrrkz },
	  { X86::VANDPSZrmk, X86::VANDPDZrmk,
	    X86::VPANDQZrmk, X86::VPANDDZrmk },
	  { X86::VANDPSZrmkz, X86::VANDPDZrmkz,
	    X86::VPANDQZrmkz, X86::VPANDDZrmkz },
	  { X86::VANDPSZrrk, X86::VANDPDZrrk,
	    X86::VPANDQZrrk, X86::VPANDDZrrk },
	  { X86::VANDPSZrrkz, X86::VANDPDZrrkz,
	    X86::VPANDQZrrkz, X86::VPANDDZrrkz },
	  { X86::VORPSZrmk, X86::VORPDZrmk,
	    X86::VPORQZrmk, X86::VPORDZrmk },
	  { X86::VORPSZrmkz, X86::VORPDZrmkz,
	    X86::VPORQZrmkz, X86::VPORDZrmkz },
	  { X86::VORPSZrrk, X86::VORPDZrrk,
	    X86::VPORQZrrk, X86::VPORDZrrk },
	  { X86::VORPSZrrkz, X86::VORPDZrrkz,
	    X86::VPORQZrrkz, X86::VPORDZrrkz },
	  { X86::VXORPSZrmk, X86::VXORPDZrmk,
	    X86::VPXORQZrmk, X86::VPXORDZrmk },
	  { X86::VXORPSZrmkz, X86::VXORPDZrmkz,
	    X86::VPXORQZrmkz, X86::VPXORDZrmkz },
	  { X86::VXORPSZrrk, X86::VXORPDZrrk,
	    X86::VPXORQZrrk, X86::VPXORDZrrk },
	  { X86::VXORPSZrrkz, X86::VXORPDZrrkz,
	    X86::VPXORQZrrkz, X86::VPXORDZrrkz },
	  // Broadcast loads can be handled the same as masked operations to avoid
	  // changing element size.
	  { X86::VANDNPSZ128rmb, X86::VANDNPDZ128rmb,
	    X86::VPANDNQZ128rmb, X86::VPANDNDZ128rmb },
	  { X86::VANDPSZ128rmb, X86::VANDPDZ128rmb,
	    X86::VPANDQZ128rmb, X86::VPANDDZ128rmb },
	  { X86::VORPSZ128rmb, X86::VORPDZ128rmb,
	    X86::VPORQZ128rmb, X86::VPORDZ128rmb },
	  { X86::VXORPSZ128rmb, X86::VXORPDZ128rmb,
	    X86::VPXORQZ128rmb, X86::VPXORDZ128rmb },
	  { X86::VANDNPSZ256rmb, X86::VANDNPDZ256rmb,
	    X86::VPANDNQZ256rmb, X86::VPANDNDZ256rmb },
	  { X86::VANDPSZ256rmb, X86::VANDPDZ256rmb,
	    X86::VPANDQZ256rmb, X86::VPANDDZ256rmb },
	  { X86::VORPSZ256rmb, X86::VORPDZ256rmb,
	    X86::VPORQZ256rmb, X86::VPORDZ256rmb },
	  { X86::VXORPSZ256rmb, X86::VXORPDZ256rmb,
	    X86::VPXORQZ256rmb, X86::VPXORDZ256rmb },
	  { X86::VANDNPSZrmb, X86::VANDNPDZrmb,
	    X86::VPANDNQZrmb, X86::VPANDNDZrmb },
	  { X86::VANDPSZrmb, X86::VANDPDZrmb,
	    X86::VPANDQZrmb, X86::VPANDDZrmb },
	  { X86::VORPSZrmb, X86::VORPDZrmb,
	    X86::VPORQZrmb, X86::VPORDZrmb },
	  { X86::VXORPSZrmb, X86::VXORPDZrmb,
	    X86::VPXORQZrmb, X86::VPXORDZrmb },
	  { X86::VANDNPSZ128rmbk, X86::VANDNPDZ128rmbk,
	    X86::VPANDNQZ128rmbk, X86::VPANDNDZ128rmbk },
	  { X86::VANDPSZ128rmbk, X86::VANDPDZ128rmbk,
	    X86::VPANDQZ128rmbk, X86::VPANDDZ128rmbk },
	  { X86::VORPSZ128rmbk, X86::VORPDZ128rmbk,
	    X86::VPORQZ128rmbk, X86::VPORDZ128rmbk },
	  { X86::VXORPSZ128rmbk, X86::VXORPDZ128rmbk,
	    X86::VPXORQZ128rmbk, X86::VPXORDZ128rmbk },
	  { X86::VANDNPSZ256rmbk, X86::VANDNPDZ256rmbk,
	    X86::VPANDNQZ256rmbk, X86::VPANDNDZ256rmbk },
	  { X86::VANDPSZ256rmbk, X86::VANDPDZ256rmbk,
	    X86::VPANDQZ256rmbk, X86::VPANDDZ256rmbk },
	  { X86::VORPSZ256rmbk, X86::VORPDZ256rmbk,
	    X86::VPORQZ256rmbk, X86::VPORDZ256rmbk },
	  { X86::VXORPSZ256rmbk, X86::VXORPDZ256rmbk,
	    X86::VPXORQZ256rmbk, X86::VPXORDZ256rmbk },
	  { X86::VANDNPSZrmbk, X86::VANDNPDZrmbk,
	    X86::VPANDNQZrmbk, X86::VPANDNDZrmbk },
	  { X86::VANDPSZrmbk, X86::VANDPDZrmbk,
	    X86::VPANDQZrmbk, X86::VPANDDZrmbk },
	  { X86::VORPSZrmbk, X86::VORPDZrmbk,
	    X86::VPORQZrmbk, X86::VPORDZrmbk },
	  { X86::VXORPSZrmbk, X86::VXORPDZrmbk,
	    X86::VPXORQZrmbk, X86::VPXORDZrmbk },
	  { X86::VANDNPSZ128rmbkz, X86::VANDNPDZ128rmbkz,
	    X86::VPANDNQZ128rmbkz, X86::VPANDNDZ128rmbkz },
	  { X86::VANDPSZ128rmbkz, X86::VANDPDZ128rmbkz,
	    X86::VPANDQZ128rmbkz, X86::VPANDDZ128rmbkz },
	  { X86::VORPSZ128rmbkz, X86::VORPDZ128rmbkz,
	    X86::VPORQZ128rmbkz, X86::VPORDZ128rmbkz },
	  { X86::VXORPSZ128rmbkz, X86::VXORPDZ128rmbkz,
	    X86::VPXORQZ128rmbkz, X86::VPXORDZ128rmbkz },
	  { X86::VANDNPSZ256rmbkz, X86::VANDNPDZ256rmbkz,
	    X86::VPANDNQZ256rmbkz, X86::VPANDNDZ256rmbkz },
	  { X86::VANDPSZ256rmbkz, X86::VANDPDZ256rmbkz,
	    X86::VPANDQZ256rmbkz, X86::VPANDDZ256rmbkz },
	  { X86::VORPSZ256rmbkz, X86::VORPDZ256rmbkz,
	    X86::VPORQZ256rmbkz, X86::VPORDZ256rmbkz },
	  { X86::VXORPSZ256rmbkz, X86::VXORPDZ256rmbkz,
	    X86::VPXORQZ256rmbkz, X86::VPXORDZ256rmbkz },
	  { X86::VANDNPSZrmbkz, X86::VANDNPDZrmbkz,
	    X86::VPANDNQZrmbkz, X86::VPANDNDZrmbkz },
	  { X86::VANDPSZrmbkz, X86::VANDPDZrmbkz,
	    X86::VPANDQZrmbkz, X86::VPANDDZrmbkz },
	  { X86::VORPSZrmbkz, X86::VORPDZrmbkz,
	    X86::VPORQZrmbkz, X86::VPORDZrmbkz },
	  { X86::VXORPSZrmbkz, X86::VXORPDZrmbkz,
	    X86::VPXORQZrmbkz, X86::VPXORDZrmbkz },
	};

	// FIXME: Some shuffle and unpack instructions have equivalents in different
	// domains, but they require a bit more work than just switching opcodes.

	/// Search a three-column replacement table for \p opcode in the column that
	/// belongs to \p domain (column index is domain - 1, so callers must pass a
	/// non-zero domain). Returns the first matching row, or nullptr if no row
	/// matches.
	static const uint16_t *lookup(unsigned opcode, unsigned domain,
	                              ArrayRef<uint16_t[3]> Table) {
	  const unsigned Column = domain - 1;
	  for (const uint16_t (&Entry)[3] : Table) {
	    if (Entry[Column] != opcode)
	      continue;
	    return Entry;
	  }
	  return nullptr;
	}

	/// Search a four-column AVX-512 replacement table for \p opcode.
	///
	/// The column for \p domain is index (domain - 1), so callers must pass a
	/// non-zero domain. For the integer domain (3) the extra integer column at
	/// index 3 is checked as well. Returns the first matching row, or nullptr if
	/// no row matches.
	static const uint16_t *lookupAVX512(unsigned opcode, unsigned domain,
	                                    ArrayRef<uint16_t[4]> Table) {
	  // If this is the integer domain make sure to check both integer columns.
	  for (const uint16_t (&Row)[4] : Table)
	    if (Row[domain-1] == opcode || (domain == 3 && Row[3] == opcode))
	      return Row;
	  return nullptr;
	}

	/// Report the execution domain of \p MI and the domains it can be moved to.
	///
	/// \returns a pair {domain, validDomains}. `domain` is the two-bit SSE domain
	/// field from the instruction's TSFlags (0 when the instruction has none).
	/// `validDomains` is a bitmask with bit N set when domain N is an allowed
	/// target; it stays 0 when the opcode is in none of the replacement tables.
	std::pair<uint16_t, uint16_t>
	X86InstrInfo::getExecutionDomain(const MachineInstr &MI) const {
	  uint16_t domain = (MI.getDesc().TSFlags >> X86II::SSEDomainShift) & 3;
	  unsigned opcode = MI.getOpcode();
	  uint16_t validDomains = 0;
	  if (domain) {
	    if (lookup(opcode, domain, ReplaceableInstrs)) {
	      validDomains = 0xe;
	    } else if (lookup(opcode, domain, ReplaceableInstrsAVX2)) {
	      // Without AVX2 only the two floating-point domains are available.
	      validDomains = Subtarget.hasAVX2() ? 0xe : 0x6;
	    } else if (lookupAVX512(opcode, domain, ReplaceableInstrsAVX512)) {
	      validDomains = 0xe;
	    } else if (lookupAVX512(opcode, domain, ReplaceableInstrsAVX512DQ)) {
	      // Without DQI only the integer domain is available.
	      validDomains = Subtarget.hasDQI() ? 0xe : 0x8;
	    } else if (const uint16_t *table = lookupAVX512(opcode, domain,
	                                         ReplaceableInstrsAVX512DQMasked)) {
	      // Masked/broadcast forms: PackedSingle and the 32-bit-element integer
	      // opcode (column 3) pair up as domains 1 and 3; everything else pairs
	      // up as domains 2 and 3.
	      if (domain == 1 || (domain == 3 && table[3] == opcode))
	        validDomains = Subtarget.hasDQI() ? 0xa : 0x8;
	      else
	        validDomains = Subtarget.hasDQI() ? 0xc : 0x8;
	    }
	  }
	  return std::make_pair(domain, validDomains);
	}

	/// Rewrite \p MI in place so that it uses the equivalent opcode for execution
	/// domain \p Domain (1..3).
	///
	/// The replacement tables are searched in order: ReplaceableInstrs,
	/// ReplaceableInstrsAVX2, ReplaceableInstrsAVX512, ReplaceableInstrsAVX512DQ,
	/// ReplaceableInstrsAVX512DQMasked. For the four-column AVX-512 tables a
	/// request for the integer domain may be redirected to the second integer
	/// column (index 3) so the element size is preserved. Asserts if the
	/// instruction has no SSE domain or is found in none of the tables.
	void X86InstrInfo::setExecutionDomain(MachineInstr &MI, unsigned Domain) const {
	  assert(Domain>0 && Domain<4 && "Invalid execution domain");
	  uint16_t dom = (MI.getDesc().TSFlags >> X86II::SSEDomainShift) & 3;
	  assert(dom && "Not an SSE instruction");
	  const unsigned Opcode = MI.getOpcode();
	  const uint16_t *table = lookup(Opcode, dom, ReplaceableInstrs);
	  if (!table) { // try the other table
	    assert((Subtarget.hasAVX2() || Domain < 3) &&
	           "256-bit vector operations only available in AVX2");
	    table = lookup(Opcode, dom, ReplaceableInstrsAVX2);
	  }
	  if (!table) { // try the AVX512 table
	    assert(Subtarget.hasAVX512() && "Requires AVX-512");
	    table = lookupAVX512(Opcode, dom, ReplaceableInstrsAVX512);
	    // Don't change integer Q instructions to D instructions.
	    if (table && Domain == 3 && table[3] == Opcode)
	      Domain = 4;
	  }
	  if (!table) { // try the AVX512DQ table
	    assert((Subtarget.hasDQI() || Domain >= 3) && "Requires AVX-512DQ");
	    table = lookupAVX512(Opcode, dom, ReplaceableInstrsAVX512DQ);
	    // Don't change integer Q instructions to D instructions and
	    // use D instructions if we started with a PS instruction.
	    if (table && Domain == 3 && (dom == 1 || table[3] == Opcode))
	      Domain = 4;
	  }
	  if (!table) { // try the AVX512DQMasked table
	    assert((Subtarget.hasDQI() || Domain >= 3) && "Requires AVX-512DQ");
	    table = lookupAVX512(Opcode, dom, ReplaceableInstrsAVX512DQMasked);
	    // Same column selection as for the unmasked AVX512DQ table above.
	    if (table && Domain == 3 && (dom == 1 || table[3] == Opcode))
	      Domain = 4;
	  }
	  assert(table && "Cannot change domain");
	  // Domain is 1..4 here; 4 selects the second integer column of a
	  // four-column row.
	  MI.setDesc(get(table[Domain - 1]));
	}

	/// Fill in \p NopInst with the instruction to emit as a no-op.
	/// Only the opcode is set here (to X86::NOOP); no operands are added.
	/// NOTE(review): per the function name this is for Mach-O targets; confirm
	/// against the callers.
	void X86InstrInfo::getNoopForMachoTarget(MCInst &NopInst) const {
	NopInst.setOpcode(X86::NOOP);
	}

	/// Return true if \p opc is one of the opcodes this target treats as a
	/// high-latency definition: the divide, square-root, gather and scatter
	/// opcodes listed below. Every other opcode yields false.
	bool X86InstrInfo::isHighLatencyDef(int opc) const {
	  switch (opc) {
	  // SSE divide and square root
	  case X86::DIVPDrm:
	  case X86::DIVPDrr:
	  case X86::DIVPSrm:
	  case X86::DIVPSrr:
	  case X86::DIVSDrm:
	  case X86::DIVSDrm_Int:
	  case X86::DIVSDrr:
	  case X86::DIVSDrr_Int:
	  case X86::DIVSSrm:
	  case X86::DIVSSrm_Int:
	  case X86::DIVSSrr:
	  case X86::DIVSSrr_Int:
	  case X86::SQRTPDm:
	  case X86::SQRTPDr:
	  case X86::SQRTPSm:
	  case X86::SQRTPSr:
	  case X86::SQRTSDm:
	  case X86::SQRTSDm_Int:
	  case X86::SQRTSDr:
	  case X86::SQRTSDr_Int:
	  case X86::SQRTSSm:
	  case X86::SQRTSSm_Int:
	  case X86::SQRTSSr:
	  case X86::SQRTSSr_Int:
	  // AVX instructions with high latency
	  case X86::VDIVPDrm:
	  case X86::VDIVPDrr:
	  case X86::VDIVPDYrm:
	  case X86::VDIVPDYrr:
	  case X86::VDIVPSrm:
	  case X86::VDIVPSrr:
	  case X86::VDIVPSYrm:
	  case X86::VDIVPSYrr:
	  case X86::VDIVSDrm:
	  case X86::VDIVSDrm_Int:
	  case X86::VDIVSDrr:
	  case X86::VDIVSDrr_Int:
	  case X86::VDIVSSrm:
	  case X86::VDIVSSrm_Int:
	  case X86::VDIVSSrr:
	  case X86::VDIVSSrr_Int:
	  case X86::VSQRTPDm:
	  case X86::VSQRTPDr:
	  case X86::VSQRTPDYm:
	  case X86::VSQRTPDYr:
	  case X86::VSQRTPSm:
	  case X86::VSQRTPSr:
	  case X86::VSQRTPSYm:
	  case X86::VSQRTPSYr:
	  case X86::VSQRTSDm:
	  case X86::VSQRTSDm_Int:
	  case X86::VSQRTSDr:
	  case X86::VSQRTSDr_Int:
	  case X86::VSQRTSSm:
	  case X86::VSQRTSSm_Int:
	  case X86::VSQRTSSr:
	  case X86::VSQRTSSr_Int:
	  // AVX512 instructions with high latency
	  case X86::VDIVPDZ128rm:
	  case X86::VDIVPDZ128rmb:
	  case X86::VDIVPDZ128rmbk:
	  case X86::VDIVPDZ128rmbkz:
	  case X86::VDIVPDZ128rmk:
	  case X86::VDIVPDZ128rmkz:
	  case X86::VDIVPDZ128rr:
	  case X86::VDIVPDZ128rrk:
	  case X86::VDIVPDZ128rrkz:
	  case X86::VDIVPDZ256rm:
	  case X86::VDIVPDZ256rmb:
	  case X86::VDIVPDZ256rmbk:
	  case X86::VDIVPDZ256rmbkz:
	  case X86::VDIVPDZ256rmk:
	  case X86::VDIVPDZ256rmkz:
	  case X86::VDIVPDZ256rr:
	  case X86::VDIVPDZ256rrk:
	  case X86::VDIVPDZ256rrkz:
	  case X86::VDIVPDZrb:
	  case X86::VDIVPDZrbk:
	  case X86::VDIVPDZrbkz:
	  case X86::VDIVPDZrm:
	  case X86::VDIVPDZrmb:
	  case X86::VDIVPDZrmbk:
	  case X86::VDIVPDZrmbkz:
	  case X86::VDIVPDZrmk:
	  case X86::VDIVPDZrmkz:
	  case X86::VDIVPDZrr:
	  case X86::VDIVPDZrrk:
	  case X86::VDIVPDZrrkz:
	  case X86::VDIVPSZ128rm:
	  case X86::VDIVPSZ128rmb:
	  case X86::VDIVPSZ128rmbk:
	  case X86::VDIVPSZ128rmbkz:
	  case X86::VDIVPSZ128rmk:
	  case X86::VDIVPSZ128rmkz:
	  case X86::VDIVPSZ128rr:
	  case X86::VDIVPSZ128rrk:
	  case X86::VDIVPSZ128rrkz:
	  case X86::VDIVPSZ256rm:
	  case X86::VDIVPSZ256rmb:
	  case X86::VDIVPSZ256rmbk:
	  case X86::VDIVPSZ256rmbkz:
	  case X86::VDIVPSZ256rmk:
	  case X86::VDIVPSZ256rmkz:
	  case X86::VDIVPSZ256rr:
	  case X86::VDIVPSZ256rrk:
	  case X86::VDIVPSZ256rrkz:
	  case X86::VDIVPSZrb:
	  case X86::VDIVPSZrbk:
	  case X86::VDIVPSZrbkz:
	  case X86::VDIVPSZrm:
	  case X86::VDIVPSZrmb:
	  case X86::VDIVPSZrmbk:
	  case X86::VDIVPSZrmbkz:
	  case X86::VDIVPSZrmk:
	  case X86::VDIVPSZrmkz:
	  case X86::VDIVPSZrr:
	  case X86::VDIVPSZrrk:
	  case X86::VDIVPSZrrkz:
	  case X86::VDIVSDZrm:
	  case X86::VDIVSDZrr:
	  case X86::VDIVSDZrm_Int:
	  case X86::VDIVSDZrm_Intk:
	  case X86::VDIVSDZrm_Intkz:
	  case X86::VDIVSDZrr_Int:
	  case X86::VDIVSDZrr_Intk:
	  case X86::VDIVSDZrr_Intkz:
	  case X86::VDIVSDZrrb:
	  case X86::VDIVSDZrrbk:
	  case X86::VDIVSDZrrbkz:
	  case X86::VDIVSSZrm:
	  case X86::VDIVSSZrr:
	  case X86::VDIVSSZrm_Int:
	  case X86::VDIVSSZrm_Intk:
	  case X86::VDIVSSZrm_Intkz:
	  case X86::VDIVSSZrr_Int:
	  case X86::VDIVSSZrr_Intk:
	  case X86::VDIVSSZrr_Intkz:
	  case X86::VDIVSSZrrb:
	  case X86::VDIVSSZrrbk:
	  case X86::VDIVSSZrrbkz:
	  case X86::VSQRTPDZ128m:
	  case X86::VSQRTPDZ128mb:
	  case X86::VSQRTPDZ128mbk:
	  case X86::VSQRTPDZ128mbkz:
	  case X86::VSQRTPDZ128mk:
	  case X86::VSQRTPDZ128mkz:
	  case X86::VSQRTPDZ128r:
	  case X86::VSQRTPDZ128rk:
	  case X86::VSQRTPDZ128rkz:
	  case X86::VSQRTPDZ256m:
	  case X86::VSQRTPDZ256mb:
	  case X86::VSQRTPDZ256mbk:
	  case X86::VSQRTPDZ256mbkz:
	  case X86::VSQRTPDZ256mk:
	  case X86::VSQRTPDZ256mkz:
	  case X86::VSQRTPDZ256r:
	  case X86::VSQRTPDZ256rk:
	  case X86::VSQRTPDZ256rkz:
	  case X86::VSQRTPDZm:
	  case X86::VSQRTPDZmb:
	  case X86::VSQRTPDZmbk:
	  case X86::VSQRTPDZmbkz:
	  case X86::VSQRTPDZmk:
	  case X86::VSQRTPDZmkz:
	  case X86::VSQRTPDZr:
	  case X86::VSQRTPDZrb:
	  case X86::VSQRTPDZrbk:
	  case X86::VSQRTPDZrbkz:
	  case X86::VSQRTPDZrk:
	  case X86::VSQRTPDZrkz:
	  case X86::VSQRTPSZ128m:
	  case X86::VSQRTPSZ128mb:
	  case X86::VSQRTPSZ128mbk:
	  case X86::VSQRTPSZ128mbkz:
	  case X86::VSQRTPSZ128mk:
	  case X86::VSQRTPSZ128mkz:
	  case X86::VSQRTPSZ128r:
	  case X86::VSQRTPSZ128rk:
	  case X86::VSQRTPSZ128rkz:
	  case X86::VSQRTPSZ256m:
	  case X86::VSQRTPSZ256mb:
	  case X86::VSQRTPSZ256mbk:
	  case X86::VSQRTPSZ256mbkz:
	  case X86::VSQRTPSZ256mk:
	  case X86::VSQRTPSZ256mkz:
	  case X86::VSQRTPSZ256r:
	  case X86::VSQRTPSZ256rk:
	  case X86::VSQRTPSZ256rkz:
	  case X86::VSQRTPSZm:
	  case X86::VSQRTPSZmb:
	  case X86::VSQRTPSZmbk:
	  case X86::VSQRTPSZmbkz:
	  case X86::VSQRTPSZmk:
	  case X86::VSQRTPSZmkz:
	  case X86::VSQRTPSZr:
	  case X86::VSQRTPSZrb:
	  case X86::VSQRTPSZrbk:
	  case X86::VSQRTPSZrbkz:
	  case X86::VSQRTPSZrk:
	  case X86::VSQRTPSZrkz:
	  case X86::VSQRTSDZm:
	  case X86::VSQRTSDZm_Int:
	  case X86::VSQRTSDZm_Intk:
	  case X86::VSQRTSDZm_Intkz:
	  case X86::VSQRTSDZr:
	  case X86::VSQRTSDZr_Int:
	  case X86::VSQRTSDZr_Intk:
	  case X86::VSQRTSDZr_Intkz:
	  case X86::VSQRTSDZrb_Int:
	  case X86::VSQRTSDZrb_Intk:
	  case X86::VSQRTSDZrb_Intkz:
	  case X86::VSQRTSSZm:
	  case X86::VSQRTSSZm_Int:
	  case X86::VSQRTSSZm_Intk:
	  case X86::VSQRTSSZm_Intkz:
	  case X86::VSQRTSSZr:
	  case X86::VSQRTSSZr_Int:
	  case X86::VSQRTSSZr_Intk:
	  case X86::VSQRTSSZr_Intkz:
	  case X86::VSQRTSSZrb_Int:
	  case X86::VSQRTSSZrb_Intk:
	  case X86::VSQRTSSZrb_Intkz:
	  // Gather and scatter opcodes (including the VGATHERPF*/VSCATTERPF* ones)
	  case X86::VGATHERDPDYrm:
	  case X86::VGATHERDPDZ128rm:
	  case X86::VGATHERDPDZ256rm:
	  case X86::VGATHERDPDZrm:
	  case X86::VGATHERDPDrm:
	  case X86::VGATHERDPSYrm:
	  case X86::VGATHERDPSZ128rm:
	  case X86::VGATHERDPSZ256rm:
	  case X86::VGATHERDPSZrm:
	  case X86::VGATHERDPSrm:
	  case X86::VGATHERPF0DPDm:
	  case X86::VGATHERPF0DPSm:
	  case X86::VGATHERPF0QPDm:
	  case X86::VGATHERPF0QPSm:
	  case X86::VGATHERPF1DPDm:
	  case X86::VGATHERPF1DPSm:
	  case X86::VGATHERPF1QPDm:
	  case X86::VGATHERPF1QPSm:
	  case X86::VGATHERQPDYrm:
	  case X86::VGATHERQPDZ128rm:
	  case X86::VGATHERQPDZ256rm:
	  case X86::VGATHERQPDZrm:
	  case X86::VGATHERQPDrm:
	  case X86::VGATHERQPSYrm:
	  case X86::VGATHERQPSZ128rm:
	  case X86::VGATHERQPSZ256rm:
	  case X86::VGATHERQPSZrm:
	  case X86::VGATHERQPSrm:
	  case X86::VPGATHERDDYrm:
	  case X86::VPGATHERDDZ128rm:
	  case X86::VPGATHERDDZ256rm:
	  case X86::VPGATHERDDZrm:
	  case X86::VPGATHERDDrm:
	  case X86::VPGATHERDQYrm:
	  case X86::VPGATHERDQZ128rm:
	  case X86::VPGATHERDQZ256rm:
	  case X86::VPGATHERDQZrm:
	  case X86::VPGATHERDQrm:
	  case X86::VPGATHERQDYrm:
	  case X86::VPGATHERQDZ128rm:
	  case X86::VPGATHERQDZ256rm:
	  case X86::VPGATHERQDZrm:
	  case X86::VPGATHERQDrm:
	  case X86::VPGATHERQQYrm:
	  case X86::VPGATHERQQZ128rm:
	  case X86::VPGATHERQQZ256rm:
	  case X86::VPGATHERQQZrm:
	  case X86::VPGATHERQQrm:
	  case X86::VSCATTERDPDZ128mr:
	  case X86::VSCATTERDPDZ256mr:
	  case X86::VSCATTERDPDZmr:
	  case X86::VSCATTERDPSZ128mr:
	  case X86::VSCATTERDPSZ256mr:
	  case X86::VSCATTERDPSZmr:
	  case X86::VSCATTERPF0DPDm:
	  case X86::VSCATTERPF0DPSm:
	  case X86::VSCATTERPF0QPDm:
	  case X86::VSCATTERPF0QPSm:
	  case X86::VSCATTERPF1DPDm:
	  case X86::VSCATTERPF1DPSm:
	  case X86::VSCATTERPF1QPDm:
	  case X86::VSCATTERPF1QPSm:
	  case X86::VSCATTERQPDZ128mr:
	  case X86::VSCATTERQPDZ256mr:
	  case X86::VSCATTERQPDZmr:
	  case X86::VSCATTERQPSZ128mr:
	  case X86::VSCATTERQPSZ256mr:
	  case X86::VSCATTERQPSZmr:
	  case X86::VPSCATTERDDZ128mr:
	  case X86::VPSCATTERDDZ256mr:
	  case X86::VPSCATTERDDZmr:
	  case X86::VPSCATTERDQZ128mr:
	  case X86::VPSCATTERDQZ256mr:
	  case X86::VPSCATTERDQZmr:
	  case X86::VPSCATTERQDZ128mr:
	  case X86::VPSCATTERQDZ256mr:
	  case X86::VPSCATTERQDZmr:
	  case X86::VPSCATTERQQZ128mr:
	  case X86::VPSCATTERQQZ256mr:
	  case X86::VPSCATTERQQZmr:
	    return true;
	  default:
	    break;
	  }
	  // Anything not listed above is not considered high latency.
	  return false;
	}

	/// Decide whether the def/use operand pair has high latency. The answer
	/// depends only on the opcode of \p DefMI (via isHighLatencyDef); the
	/// scheduling model, register info, use instruction and operand indices are
	/// not consulted.
	bool X86InstrInfo::hasHighOperandLatency(const TargetSchedModel &SchedModel,
	                                         const MachineRegisterInfo *MRI,
	                                         const MachineInstr &DefMI,
	                                         unsigned DefIdx,
	                                         const MachineInstr &UseMI,
	                                         unsigned UseIdx) const {
	  const bool DefIsHighLatency = isHighLatencyDef(DefMI.getOpcode());
	  return DefIsHighLatency;
	}

	/// Return true if the operands of \p Inst may be reassociated.
	///
	/// \p Inst must have 3 or 4 operands. A four-operand instruction is rejected
	/// unless its fourth operand (asserted to be EFLAGS) is dead; every other
	/// case is decided by the generic TargetInstrInfo implementation.
	bool X86InstrInfo::hasReassociableOperands(const MachineInstr &Inst,
	                                           const MachineBasicBlock *MBB) const {
	  assert((Inst.getNumOperands() == 3 || Inst.getNumOperands() == 4) &&
	         "Reassociation needs binary operators");

	  // Integer binary math/logic instructions have a third source operand:
	  // the EFLAGS register. That operand must be both defined here and never
	  // used; i.e., it must be dead. If the EFLAGS operand is live, then we can
	  // not change anything because rearranging the operands could affect other
	  // instructions that depend on the exact status flags (zero, sign, etc.)
	  // that are set by using these particular operands with this operation.
	  if (Inst.getNumOperands() == 4) {
	    assert(Inst.getOperand(3).isReg() &&
	           Inst.getOperand(3).getReg() == X86::EFLAGS &&
	           "Unexpected operand in reassociable instruction");
	    if (!Inst.getOperand(3).isDead())
	      return false;
	  }

	  return TargetInstrInfo::hasReassociableOperands(Inst, MBB);
	}

	// TODO: There are many more machine instruction opcodes to match:
	// 1. Other data types (integer, vectors)
	// 2. Other math / logic operations (xor, or)
	// 3. Other forms of the same operation (intrinsics and other variants)
	//
	/// Return true if \p Inst's opcode is treated as both associative and
	/// commutative. The first opcode group qualifies unconditionally; the
	/// floating-point add/multiply group qualifies only when the function's
	/// target options have UnsafeFPMath set. Unlisted opcodes return false.
	bool X86InstrInfo::isAssociativeAndCommutative(const MachineInstr &Inst) const {
	  switch (Inst.getOpcode()) {
	  default:
	    return false;

	  // Scalar integer logic and multiply.
	  case X86::AND8rr:
	  case X86::AND16rr:
	  case X86::AND32rr:
	  case X86::AND64rr:
	  case X86::OR8rr:
	  case X86::OR16rr:
	  case X86::OR32rr:
	  case X86::OR64rr:
	  case X86::XOR8rr:
	  case X86::XOR16rr:
	  case X86::XOR32rr:
	  case X86::XOR64rr:
	  case X86::IMUL16rr:
	  case X86::IMUL32rr:
	  case X86::IMUL64rr:
	  // Vector logic and integer add.
	  case X86::PANDrr:
	  case X86::PORrr:
	  case X86::PXORrr:
	  case X86::ANDPDrr:
	  case X86::ANDPSrr:
	  case X86::ORPDrr:
	  case X86::ORPSrr:
	  case X86::XORPDrr:
	  case X86::XORPSrr:
	  case X86::PADDBrr:
	  case X86::PADDWrr:
	  case X86::PADDDrr:
	  case X86::PADDQrr:
	  case X86::VPANDrr:
	  case X86::VPANDYrr:
	  case X86::VPANDDZ128rr:
	  case X86::VPANDDZ256rr:
	  case X86::VPANDDZrr:
	  case X86::VPANDQZ128rr:
	  case X86::VPANDQZ256rr:
	  case X86::VPANDQZrr:
	  case X86::VPORrr:
	  case X86::VPORYrr:
	  case X86::VPORDZ128rr:
	  case X86::VPORDZ256rr:
	  case X86::VPORDZrr:
	  case X86::VPORQZ128rr:
	  case X86::VPORQZ256rr:
	  case X86::VPORQZrr:
	  case X86::VPXORrr:
	  case X86::VPXORYrr:
	  case X86::VPXORDZ128rr:
	  case X86::VPXORDZ256rr:
	  case X86::VPXORDZrr:
	  case X86::VPXORQZ128rr:
	  case X86::VPXORQZ256rr:
	  case X86::VPXORQZrr:
	  case X86::VANDPDrr:
	  case X86::VANDPSrr:
	  case X86::VANDPDYrr:
	  case X86::VANDPSYrr:
	  case X86::VANDPDZ128rr:
	  case X86::VANDPSZ128rr:
	  case X86::VANDPDZ256rr:
	  case X86::VANDPSZ256rr:
	  case X86::VANDPDZrr:
	  case X86::VANDPSZrr:
	  case X86::VORPDrr:
	  case X86::VORPSrr:
	  case X86::VORPDYrr:
	  case X86::VORPSYrr:
	  case X86::VORPDZ128rr:
	  case X86::VORPSZ128rr:
	  case X86::VORPDZ256rr:
	  case X86::VORPSZ256rr:
	  case X86::VORPDZrr:
	  case X86::VORPSZrr:
	  case X86::VXORPDrr:
	  case X86::VXORPSrr:
	  case X86::VXORPDYrr:
	  case X86::VXORPSYrr:
	  case X86::VXORPDZ128rr:
	  case X86::VXORPSZ128rr:
	  case X86::VXORPDZ256rr:
	  case X86::VXORPSZ256rr:
	  case X86::VXORPDZrr:
	  case X86::VXORPSZrr:
	  // K-prefixed add and logic opcodes.
	  case X86::KADDBrr:
	  case X86::KADDWrr:
	  case X86::KADDDrr:
	  case X86::KADDQrr:
	  case X86::KANDBrr:
	  case X86::KANDWrr:
	  case X86::KANDDrr:
	  case X86::KANDQrr:
	  case X86::KORBrr:
	  case X86::KORWrr:
	  case X86::KORDrr:
	  case X86::KORQrr:
	  case X86::KXORBrr:
	  case X86::KXORWrr:
	  case X86::KXORDrr:
	  case X86::KXORQrr:
	  // Vector integer add and multiply-low.
	  case X86::VPADDBrr:
	  case X86::VPADDWrr:
	  case X86::VPADDDrr:
	  case X86::VPADDQrr:
	  case X86::VPADDBYrr:
	  case X86::VPADDWYrr:
	  case X86::VPADDDYrr:
	  case X86::VPADDQYrr:
	  case X86::VPADDBZ128rr:
	  case X86::VPADDWZ128rr:
	  case X86::VPADDDZ128rr:
	  case X86::VPADDQZ128rr:
	  case X86::VPADDBZ256rr:
	  case X86::VPADDWZ256rr:
	  case X86::VPADDDZ256rr:
	  case X86::VPADDQZ256rr:
	  case X86::VPADDBZrr:
	  case X86::VPADDWZrr:
	  case X86::VPADDDZrr:
	  case X86::VPADDQZrr:
	  case X86::VPMULLWrr:
	  case X86::VPMULLWYrr:
	  case X86::VPMULLWZ128rr:
	  case X86::VPMULLWZ256rr:
	  case X86::VPMULLWZrr:
	  case X86::VPMULLDrr:
	  case X86::VPMULLDYrr:
	  case X86::VPMULLDZ128rr:
	  case X86::VPMULLDZ256rr:
	  case X86::VPMULLDZrr:
	  case X86::VPMULLQZ128rr:
	  case X86::VPMULLQZ256rr:
	  case X86::VPMULLQZrr:
	  // Normal min/max instructions are not commutative because of NaN and signed
	  // zero semantics, but these are. Thus, there's no need to check for global
	  // relaxed math; the instructions themselves have the properties we need.
	  case X86::MAXCPDrr:
	  case X86::MAXCPSrr:
	  case X86::MAXCSDrr:
	  case X86::MAXCSSrr:
	  case X86::MINCPDrr:
	  case X86::MINCPSrr:
	  case X86::MINCSDrr:
	  case X86::MINCSSrr:
	  case X86::VMAXCPDrr:
	  case X86::VMAXCPSrr:
	  case X86::VMAXCPDYrr:
	  case X86::VMAXCPSYrr:
	  case X86::VMAXCPDZ128rr:
	  case X86::VMAXCPSZ128rr:
	  case X86::VMAXCPDZ256rr:
	  case X86::VMAXCPSZ256rr:
	  case X86::VMAXCPDZrr:
	  case X86::VMAXCPSZrr:
	  case X86::VMAXCSDrr:
	  case X86::VMAXCSSrr:
	  case X86::VMAXCSDZrr:
	  case X86::VMAXCSSZrr:
	  case X86::VMINCPDrr:
	  case X86::VMINCPSrr:
	  case X86::VMINCPDYrr:
	  case X86::VMINCPSYrr:
	  case X86::VMINCPDZ128rr:
	  case X86::VMINCPSZ128rr:
	  case X86::VMINCPDZ256rr:
	  case X86::VMINCPSZ256rr:
	  case X86::VMINCPDZrr:
	  case X86::VMINCPSZrr:
	  case X86::VMINCSDrr:
	  case X86::VMINCSSrr:
	  case X86::VMINCSDZrr:
	  case X86::VMINCSSZrr:
	    return true;

	  // Floating-point add and multiply: gated on the UnsafeFPMath option.
	  case X86::ADDPDrr:
	  case X86::ADDPSrr:
	  case X86::ADDSDrr:
	  case X86::ADDSSrr:
	  case X86::MULPDrr:
	  case X86::MULPSrr:
	  case X86::MULSDrr:
	  case X86::MULSSrr:
	  case X86::VADDPDrr:
	  case X86::VADDPSrr:
	  case X86::VADDPDYrr:
	  case X86::VADDPSYrr:
	  case X86::VADDPDZ128rr:
	  case X86::VADDPSZ128rr:
	  case X86::VADDPDZ256rr:
	  case X86::VADDPSZ256rr:
	  case X86::VADDPDZrr:
	  case X86::VADDPSZrr:
	  case X86::VADDSDrr:
	  case X86::VADDSSrr:
	  case X86::VADDSDZrr:
	  case X86::VADDSSZrr:
	  case X86::VMULPDrr:
	  case X86::VMULPSrr:
	  case X86::VMULPDYrr:
	  case X86::VMULPSYrr:
	  case X86::VMULPDZ128rr:
	  case X86::VMULPSZ128rr:
	  case X86::VMULPDZ256rr:
	  case X86::VMULPSZ256rr:
	  case X86::VMULPDZrr:
	  case X86::VMULPSZrr:
	  case X86::VMULSDrr:
	  case X86::VMULSSrr:
	  case X86::VMULSDZrr:
	  case X86::VMULSSZrr: {
	    const MachineFunction *ParentFn = Inst.getParent()->getParent();
	    return ParentFn->getTarget().Options.UnsafeFPMath;
	  }
	  }
	}

	/// This is an architecture-specific helper function of reassociateOps.
	/// Set special operand attributes for new instructions after reassociation.
	///
	/// Only the case where both old instructions have exactly four operands is
	/// handled; operand 3 of each instruction is then expected to be the EFLAGS
	/// register operand. For any other operand count the function returns
	/// without touching \p NewMI1 or \p NewMI2.
	///
	/// \param OldMI1,OldMI2 the instructions that were reassociated.
	/// \param NewMI1,NewMI2 the replacement instructions whose EFLAGS operands
	///        are marked dead.
	void X86InstrInfo::setSpecialOperandAttr(MachineInstr &OldMI1,
	                                         MachineInstr &OldMI2,
	                                         MachineInstr &NewMI1,
	                                         MachineInstr &NewMI2) const {
	  // Integer instructions define an implicit EFLAGS source register operand as
	  // the third source (fourth total) operand.
	  if (OldMI1.getNumOperands() != 4 || OldMI2.getNumOperands() != 4)
	    return;

	  assert(NewMI1.getNumOperands() == 4 && NewMI2.getNumOperands() == 4 &&
	         "Unexpected instruction type for reassociation");

	  MachineOperand &OldOp1 = OldMI1.getOperand(3);
	  MachineOperand &OldOp2 = OldMI2.getOperand(3);
	  MachineOperand &NewOp1 = NewMI1.getOperand(3);
	  MachineOperand &NewOp2 = NewMI2.getOperand(3);

	  assert(OldOp1.isReg() && OldOp1.getReg() == X86::EFLAGS && OldOp1.isDead() &&
	         "Must have dead EFLAGS operand in reassociable instruction");
	  assert(OldOp2.isReg() && OldOp2.getReg() == X86::EFLAGS && OldOp2.isDead() &&
	         "Must have dead EFLAGS operand in reassociable instruction");

	  // The old operands are only inspected by the asserts above; reference them
	  // explicitly so they do not appear unused when asserts are compiled out.
	  (void)OldOp1;
	  (void)OldOp2;

	  assert(NewOp1.isReg() && NewOp1.getReg() == X86::EFLAGS &&
	         "Unexpected operand in reassociable instruction");
	  assert(NewOp2.isReg() && NewOp2.getReg() == X86::EFLAGS &&
	         "Unexpected operand in reassociable instruction");

	  // Mark the new EFLAGS operands as dead to be helpful to subsequent iterations
	  // of this pass or other passes. The EFLAGS operands must be dead in these new
	  // instructions because the EFLAGS operands in the original instructions must
	  // be dead in order for reassociation to occur.
	  NewOp1.setIsDead();
	  NewOp2.setIsDead();
	}

	/// Decompose the machine operand target flags \p TF into a pair.
	///
	/// The whole of \p TF is returned as the first element; the second element
	/// is always zero.
	std::pair<unsigned, unsigned>
	X86InstrInfo::decomposeMachineOperandsTargetFlags(unsigned TF) const {
	  const unsigned NoSecondaryFlags = 0u;
	  return std::pair<unsigned, unsigned>(TF, NoSecondaryFlags);
	}

	/// Return the table that pairs each direct X86 machine operand target flag
	/// with the textual name listed next to it below.
	///
	/// The table is a function-local static array, so the returned ArrayRef
	/// refers to storage that outlives the call.
	ArrayRef<std::pair<unsigned, const char *>>
	X86InstrInfo::getSerializableDirectMachineOperandTargetFlags() const {
	  typedef std::pair<unsigned, const char *> FlagName;
	  static const FlagName FlagNames[] = {
	      {X86II::MO_GOT_ABSOLUTE_ADDRESS, "x86-got-absolute-address"},
	      {X86II::MO_PIC_BASE_OFFSET, "x86-pic-base-offset"},
	      {X86II::MO_GOT, "x86-got"},
	      {X86II::MO_GOTOFF, "x86-gotoff"},
	      {X86II::MO_GOTPCREL, "x86-gotpcrel"},
	      {X86II::MO_PLT, "x86-plt"},
	      {X86II::MO_TLSGD, "x86-tlsgd"},
	      {X86II::MO_TLSLD, "x86-tlsld"},
	      {X86II::MO_TLSLDM, "x86-tlsldm"},
	      {X86II::MO_GOTTPOFF, "x86-gottpoff"},
	      {X86II::MO_INDNTPOFF, "x86-indntpoff"},
	      {X86II::MO_TPOFF, "x86-tpoff"},
	      {X86II::MO_DTPOFF, "x86-dtpoff"},
	      {X86II::MO_NTPOFF, "x86-ntpoff"},
	      {X86II::MO_GOTNTPOFF, "x86-gotntpoff"},
	      {X86II::MO_DLLIMPORT, "x86-dllimport"},
	      {X86II::MO_DARWIN_NONLAZY, "x86-darwin-nonlazy"},
	      {X86II::MO_DARWIN_NONLAZY_PIC_BASE, "x86-darwin-nonlazy-pic-base"},
	      {X86II::MO_TLVP, "x86-tlvp"},
	      {X86II::MO_TLVP_PIC_BASE, "x86-tlvp-pic-base"},
	      {X86II::MO_SECREL, "x86-secrel"}};
	  return makeArrayRef(FlagNames);
	}

	/// Return true if the opcode of \p Inst is one of the TCRETURN* or TAILJMP*
	/// opcodes listed below, and false for every other opcode.
	bool X86InstrInfo::isTailCall(const MachineInstr &Inst) const {
	  const unsigned Opcode = Inst.getOpcode();
	  switch (Opcode) {
	  default:
	    return false;

	  // TCRETURN* opcodes.
	  case X86::TCRETURNdi:
	  case X86::TCRETURNmi:
	  case X86::TCRETURNri:
	  case X86::TCRETURNdi64:
	  case X86::TCRETURNmi64:
	  case X86::TCRETURNri64:
	  // TAILJMP* opcodes.
	  case X86::TAILJMPd:
	  case X86::TAILJMPm:
	  case X86::TAILJMPr:
	  case X86::TAILJMPd64:
	  case X86::TAILJMPm64:
	  case X86::TAILJMPr64:
	  case X86::TAILJMPm64_REX:
	  case X86::TAILJMPr64_REX:
	    return true;
	  }
	}

	namespace {
	/// Create Global Base Reg pass. This initializes the PIC
	/// global base register for x86-32.
	struct CGBR : public MachineFunctionPass {
	static char ID;
	CGBR() : MachineFunctionPass(ID) {}

	bool runOnMachineFunction(MachineFunction &MF) override {
	const X86TargetMachine *TM =
	static_cast<const X86TargetMachine *>(&MF.getTarget());
	const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>();

	// Don't do anything if this is 64-bit as 64-bit PIC
	// uses RIP relative addressing.
	if (STI.is64Bit())
	return false;

	// Only emit a global base reg in PIC mode.
	if (!TM->isPositionIndependent())
	return false;

	X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
	unsigned GlobalBaseReg = X86FI->getGlobalBaseReg();

	// If we didn't need a GlobalBaseReg, don't insert code.
	if (GlobalBaseReg == 0)
	return false;

	// Insert the set of GlobalBaseReg into the first MBB of the function
	MachineBasicBlock &FirstMBB = MF.front();
	MachineBasicBlock::iterator MBBI = FirstMBB.begin();
	DebugLoc DL = FirstMBB.findDebugLoc(MBBI);
	MachineRegisterInfo &RegInfo = MF.getRegInfo();
	const X86InstrInfo *TII = STI.getInstrInfo();

	unsigned PC;
	if (STI.isPICStyleGOT())
	PC = RegInfo.createVirtualRegister(&X86::GR32RegClass);
	else
	PC = GlobalBaseReg;

	// Operand of MovePCtoStack is completely ignored by asm printer. It's
	// only used in JIT code emission as displacement to pc.
	BuildMI(FirstMBB, MBBI, DL, TII->get(X86::MOVPC32r), PC).addImm(0);

	// If we're using vanilla 'GOT' PIC style, we should use relative addressing
	// not to pc, but to _GLOBAL_OFFSET_TABLE_ external.
	if (STI.isPICStyleGOT()) {
	// Generate addl $__GLOBAL_OFFSET_TABLE_ + [.-piclabel], %some_register
	BuildMI(FirstMBB, MBBI, DL, TII->get(X86::ADD32ri), GlobalBaseReg)
	.addReg(PC).addExternalSymbol("_GLOBAL_OFFSET_TABLE_",
	X86II::MO_GOT_ABSOLUTE_ADDRESS);
	}

	return true;
	}

	StringRef getPassName() const override {
	return "X86 PIC Global Base Reg Initialization";
	}

	void getAnalysisUsage(AnalysisUsage &AU) const override {
	AU.setPreservesCFG();
	MachineFunctionPass::getAnalysisUsage(AU);
	}
	};
	}

	// Definition of the pass identifier declared in CGBR.
	char CGBR::ID = 0;

	/// Return a newly allocated instance of the CGBR pass.
	FunctionPass *llvm::createX86GlobalBaseRegPass() {
	  return new CGBR();
	}

	namespace {
	struct LDTLSCleanup : public MachineFunctionPass {
	static char ID;
	LDTLSCleanup() : MachineFunctionPass(ID) {}

	bool runOnMachineFunction(MachineFunction &MF) override {
	if (skipFunction(*MF.getFunction()))
	return false;

	X86MachineFunctionInfo *MFI = MF.getInfo<X86MachineFunctionInfo>();
	if (MFI->getNumLocalDynamicTLSAccesses() < 2) {
	// No point folding accesses if there isn't at least two.
	return false;
	}

	MachineDominatorTree *DT = &getAnalysis<MachineDominatorTree>();
	return VisitNode(DT->getRootNode(), 0);
	}

	// Visit the dominator subtree rooted at Node in pre-order.
	// If TLSBaseAddrReg is non-null, then use that to replace any
	// TLS_base_addr instructions. Otherwise, create the register
	// when the first such instruction is seen, and then use it
	// as we encounter more instructions.
	bool VisitNode(MachineDomTreeNode *Node, unsigned TLSBaseAddrReg) {
	MachineBasicBlock *BB = Node->getBlock();
	bool Changed = false;

	// Traverse the current block.
	for (MachineBasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;
	++I) {
	switch (I->getOpcode()) {
	case X86::TLS_base_addr32:
	case X86::TLS_base_addr64:
	if (TLSBaseAddrReg)
	I = ReplaceTLSBaseAddrCall(*I, TLSBaseAddrReg);
	else
	I = SetRegister(*I, &TLSBaseAddrReg);
	Changed = true;
	break;
	default:
	break;
	}
	}

	// Visit the children of this block in the dominator tree.
	for (MachineDomTreeNode::iterator I = Node->begin(), E = Node->end();
	I != E; ++I) {
	Changed \|= VisitNode(*I, TLSBaseAddrReg);
	}

	return Changed;
	}

	// Replace the TLS_base_addr instruction I with a copy from
	// TLSBaseAddrReg, returning the new instruction.
	MachineInstr *ReplaceTLSBaseAddrCall(MachineInstr &I,
	unsigned TLSBaseAddrReg) {
	MachineFunction *MF = I.getParent()->getParent();
	const X86Subtarget &STI = MF->getSubtarget<X86Subtarget>();
	const bool is64Bit = STI.is64Bit();
	const X86InstrInfo *TII = STI.getInstrInfo();

	// Insert a Copy from TLSBaseAddrReg to RAX/EAX.
	MachineInstr *Copy =
	BuildMI(*I.getParent(), I, I.getDebugLoc(),
	TII->get(TargetOpcode::COPY), is64Bit ? X86::RAX : X86::EAX)
	.addReg(TLSBaseAddrReg);

	// Erase the TLS_base_addr instruction.
	I.eraseFromParent();

	return Copy;
	}

	// Create a virtal register in *TLSBaseAddrReg, and populate it by
	// inserting a copy instruction after I. Returns the new instruction.
	MachineInstr SetRegister(MachineInstr &I, unsigned TLSBaseAddrReg) {
	MachineFunction *MF = I.getParent()->getParent();
	const X86Subtarget &STI = MF->getSubtarget<X86Subtarget>();
	const bool is64Bit = STI.is64Bit();
	const X86InstrInfo *TII = STI.getInstrInfo();

	// Create a virtual register for the TLS base address.
	MachineRegisterInfo &RegInfo = MF->getRegInfo();
	*TLSBaseAddrReg = RegInfo.createVirtualRegister(is64Bit
	? &X86::GR64RegClass
	: &X86::GR32RegClass);

	// Insert a copy from RAX/EAX to TLSBaseAddrReg.
	MachineInstr *Next = I.getNextNode();
	MachineInstr *Copy =
	BuildMI(*I.getParent(), Next, I.getDebugLoc(),
	TII->get(TargetOpcode::COPY), *TLSBaseAddrReg)
	.addReg(is64Bit ? X86::RAX : X86::EAX);

	return Copy;
	}

	StringRef getPassName() const override {
	return "Local Dynamic TLS Access Clean-up";
	}

	void getAnalysisUsage(AnalysisUsage &AU) const override {
	AU.setPreservesCFG();
	AU.addRequired<MachineDominatorTree>();
	MachineFunctionPass::getAnalysisUsage(AU);
	}
	};
	}

	// Definition of the pass identifier declared in LDTLSCleanup.
	char LDTLSCleanup::ID = 0;

	/// Return a newly allocated instance of the LDTLSCleanup pass.
	FunctionPass *llvm::createCleanupLocalDynamicTLSPass() {
	  return new LDTLSCleanup();
	}
	Index: projects/clang400-import/contrib/llvm/lib/Target/X86/X86InstrInfo.h
	===================================================================
	--- projects/clang400-import/contrib/llvm/lib/Target/X86/X86InstrInfo.h (revision 313642)
	+++ projects/clang400-import/contrib/llvm/lib/Target/X86/X86InstrInfo.h (revision 313643)
	@@ -1,608 +1,601 @@
	//===-- X86InstrInfo.h - X86 Instruction Information ------------*- C++ -*-===//
	//
	// The LLVM Compiler Infrastructure
	//
	// This file is distributed under the University of Illinois Open Source
	// License. See LICENSE.TXT for details.
	//
	//===----------------------------------------------------------------------===//
	//
	// This file contains the X86 implementation of the TargetInstrInfo class.
	//
	//===----------------------------------------------------------------------===//

	#ifndef LLVM_LIB_TARGET_X86_X86INSTRINFO_H
	#define LLVM_LIB_TARGET_X86_X86INSTRINFO_H

	#include "MCTargetDesc/X86BaseInfo.h"
	#include "X86InstrFMA3Info.h"
	#include "X86RegisterInfo.h"
	#include "llvm/ADT/DenseMap.h"
	#include "llvm/Target/TargetInstrInfo.h"

	#define GET_INSTRINFO_HEADER
	#include "X86GenInstrInfo.inc"

	namespace llvm {
	class MachineInstrBuilder;
	class X86RegisterInfo;
	class X86Subtarget;

	namespace X86 {
	// X86 specific condition code. These correspond to X86_*_COND in
	// X86InstrInfo.td. They must be kept in synch, which is why every real
	// condition code below carries an explicit numeric value.
	enum CondCode {
	COND_A = 0,
	COND_AE = 1,
	COND_B = 2,
	COND_BE = 3,
	COND_E = 4,
	COND_G = 5,
	COND_GE = 6,
	COND_L = 7,
	COND_LE = 8,
	COND_NE = 9,
	COND_NO = 10,
	COND_NP = 11,
	COND_NS = 12,
	COND_O = 13,
	COND_P = 14,
	COND_S = 15,
	// Alias for the highest explicitly numbered condition code above.
	LAST_VALID_COND = COND_S,

	// Artificial condition codes. These are used by AnalyzeBranch
	// to indicate a block terminated with two conditional branches that together
	// form a compound condition. They occur in code using FCMP_OEQ or FCMP_UNE,
	// which can't be represented on x86 with a single condition. These
	// are never used in MachineInstrs and are inverses of one another.
	COND_NE_OR_P,
	COND_E_AND_NP,

	// Sentinel that follows all real and artificial condition codes.
	COND_INVALID
	};

	/// Turn condition code into conditional branch opcode.
	unsigned GetCondBranchFromCond(CondCode CC);

	/// \brief Return a set opcode for the given condition and whether it has
	/// a memory operand.
	unsigned getSETFromCond(CondCode CC, bool HasMemoryOperand = false);

	/// \brief Return a cmov opcode for the given condition, register size in
	/// bytes, and operand type.
	unsigned getCMovFromCond(CondCode CC, unsigned RegBytes,
	bool HasMemoryOperand = false);

	/// Turn CMov opcode into condition code.
	CondCode getCondFromCMovOpc(unsigned Opc);

	/// GetOppositeBranchCondition - Return the inverse of the specified cond,
	/// e.g. turning COND_E to COND_NE.
	CondCode GetOppositeBranchCondition(CondCode CC);
	} // end namespace X86;


	/// isGlobalStubReference - Return true if the specified TargetFlag operand is
	/// a reference to a stub for a global, not the global itself.
	///
	/// The flags treated as stub references are the dllimport stub, the
	/// rip-relative and normal GOT references, and the two Darwin $non_lazy_ptr
	/// references.
	inline static bool isGlobalStubReference(unsigned char TargetFlag) {
	  return TargetFlag == X86II::MO_DLLIMPORT ||               // dllimport stub.
	         TargetFlag == X86II::MO_GOTPCREL ||                // rip-relative GOT.
	         TargetFlag == X86II::MO_GOT ||                     // normal GOT.
	         TargetFlag == X86II::MO_DARWIN_NONLAZY_PIC_BASE || // $non_lazy_ptr.
	         TargetFlag == X86II::MO_DARWIN_NONLAZY;            // $non_lazy_ptr.
	}

	/// isGlobalRelativeToPICBase - Return true if the specified global value
	/// reference is relative to a 32-bit PIC base (X86ISD::GlobalBaseReg). If this
	/// is true, the addressing mode has the PIC base register added in (e.g. EBX).
	inline static bool isGlobalRelativeToPICBase(unsigned char TargetFlag) {
	  switch (TargetFlag) {
	  default:
	    return false;

	  // isPICStyleGOT: local global, then other global.
	  case X86II::MO_GOTOFF:
	  case X86II::MO_GOT:
	  // Darwin local global, then Darwin/32 external global.
	  case X86II::MO_PIC_BASE_OFFSET:
	  case X86II::MO_DARWIN_NONLAZY_PIC_BASE:
	  // The original author marked this entry as uncertain ("??? Pretty sure..").
	  case X86II::MO_TLVP:
	    return true;
	  }
	}

	/// Return true if \p MO is an immediate operand whose value is 1, 2, 4 or 8.
	inline static bool isScale(const MachineOperand &MO) {
	  return MO.isImm() &&
	         (MO.getImm() == 1 || MO.getImm() == 2 ||
	          MO.getImm() == 4 || MO.getImm() == 8);
	}

	inline static bool isLeaMem(const MachineInstr &MI, unsigned Op) {
	if (MI.getOperand(Op).isFI())
	return true;
	return Op + X86::AddrSegmentReg <= MI.getNumOperands() &&
	MI.getOperand(Op + X86::AddrBaseReg).isReg() &&
	isScale(MI.getOperand(Op + X86::AddrScaleAmt)) &&
	MI.getOperand(Op + X86::AddrIndexReg).isReg() &&
	(MI.getOperand(Op + X86::AddrDisp).isImm() \|\|
	MI.getOperand(Op + X86::AddrDisp).isGlobal() \|\|
	MI.getOperand(Op + X86::AddrDisp).isCPI() \|\|
	MI.getOperand(Op + X86::AddrDisp).isJTI());
	}

	/// Return true if the operands of \p MI starting at index \p Op form a full
	/// memory reference: either a frame index, or at least X86::AddrNumOperands
	/// operands whose segment operand is a register and whose remaining operands
	/// satisfy isLeaMem.
	inline static bool isMem(const MachineInstr &MI, unsigned Op) {
	  if (MI.getOperand(Op).isFI())
	    return true;

	  // Not enough operands left to hold a complete address.
	  if (Op + X86::AddrNumOperands > MI.getNumOperands())
	    return false;

	  if (!MI.getOperand(Op + X86::AddrSegmentReg).isReg())
	    return false;

	  return isLeaMem(MI, Op);
	}

	class X86InstrInfo final : public X86GenInstrInfo {
	X86Subtarget &Subtarget;
	const X86RegisterInfo RI;

	/// RegOp2MemOpTable3Addr, RegOp2MemOpTable0, RegOp2MemOpTable1,
	/// RegOp2MemOpTable2, RegOp2MemOpTable3 - Load / store folding opcode maps.
	///
	typedef DenseMap<unsigned,
	std::pair<uint16_t, uint16_t> > RegOp2MemOpTableType;
	RegOp2MemOpTableType RegOp2MemOpTable2Addr;
	RegOp2MemOpTableType RegOp2MemOpTable0;
	RegOp2MemOpTableType RegOp2MemOpTable1;
	RegOp2MemOpTableType RegOp2MemOpTable2;
	RegOp2MemOpTableType RegOp2MemOpTable3;
	RegOp2MemOpTableType RegOp2MemOpTable4;

	/// MemOp2RegOpTable - Load / store unfolding opcode map.
	///
	typedef DenseMap<unsigned,
	std::pair<uint16_t, uint16_t> > MemOp2RegOpTableType;
	MemOp2RegOpTableType MemOp2RegOpTable;

	static void AddTableEntry(RegOp2MemOpTableType &R2MTable,
	MemOp2RegOpTableType &M2RTable,
	uint16_t RegOp, uint16_t MemOp, uint16_t Flags);

	virtual void anchor();

	bool AnalyzeBranchImpl(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
	MachineBasicBlock *&FBB,
	SmallVectorImpl<MachineOperand> &Cond,
	SmallVectorImpl<MachineInstr *> &CondBranches,
	bool AllowModify) const;

	public:
	explicit X86InstrInfo(X86Subtarget &STI);

	/// getRegisterInfo - TargetInstrInfo is a superset of MRegister info. As
	/// such, whenever a client has an instance of instruction info, it should
	/// always be able to get register info as well (through this method).
	///
	const X86RegisterInfo &getRegisterInfo() const { return RI; }

	/// getSPAdjust - This returns the stack pointer adjustment made by
	/// this instruction. For x86, we need to handle more complex call
	/// sequences involving PUSHes.
	int getSPAdjust(const MachineInstr &MI) const override;

	/// isCoalescableExtInstr - Return true if the instruction is a "coalescable"
	/// extension instruction. That is, it's like a copy where it's legal for the
	/// source to overlap the destination. e.g. X86::MOVSX64rr32. If this returns
	/// true, then it's expected the pre-extension value is available as a subreg
	/// of the result register. This also returns the sub-register index in
	/// SubIdx.
	bool isCoalescableExtInstr(const MachineInstr &MI,
	unsigned &SrcReg, unsigned &DstReg,
	unsigned &SubIdx) const override;

	unsigned isLoadFromStackSlot(const MachineInstr &MI,
	int &FrameIndex) const override;
	/// isLoadFromStackSlotPostFE - Check for post-frame ptr elimination
	/// stack locations as well. This uses a heuristic so it isn't
	/// reliable for correctness.
	unsigned isLoadFromStackSlotPostFE(const MachineInstr &MI,
	int &FrameIndex) const override;

	unsigned isStoreToStackSlot(const MachineInstr &MI,
	int &FrameIndex) const override;
	/// isStoreToStackSlotPostFE - Check for post-frame ptr elimination
	/// stack locations as well. This uses a heuristic so it isn't
	/// reliable for correctness.
	unsigned isStoreToStackSlotPostFE(const MachineInstr &MI,
	int &FrameIndex) const override;

	bool isReallyTriviallyReMaterializable(const MachineInstr &MI,
	AliasAnalysis *AA) const override;
	void reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
	unsigned DestReg, unsigned SubIdx,
	const MachineInstr &Orig,
	const TargetRegisterInfo &TRI) const override;

	/// Given an operand within a MachineInstr, insert preceding code to put it
	/// into the right format for a particular kind of LEA instruction. This may
	/// involve using an appropriate super-register instead (with an implicit use
	/// of the original) or creating a new virtual register and inserting COPY
	/// instructions to get the data into the right class.
	///
	/// Reference parameters are set to indicate how caller should add this
	/// operand to the LEA instruction.
	bool classifyLEAReg(MachineInstr &MI, const MachineOperand &Src,
	unsigned LEAOpcode, bool AllowSP, unsigned &NewSrc,
	bool &isKill, bool &isUndef,
	MachineOperand &ImplicitOp, LiveVariables *LV) const;

	/// convertToThreeAddress - This method must be implemented by targets that
	/// set the M_CONVERTIBLE_TO_3_ADDR flag. When this flag is set, the target
	/// may be able to convert a two-address instruction into a true
	/// three-address instruction on demand. This allows the X86 target (for
	/// example) to convert ADD and SHL instructions into LEA instructions if they
	/// would require register copies due to two-addressness.
	///
	/// This method returns a null pointer if the transformation cannot be
	/// performed, otherwise it returns the new instruction.
	///
	MachineInstr *convertToThreeAddress(MachineFunction::iterator &MFI,
	MachineInstr &MI,
	LiveVariables *LV) const override;

	/// Returns true iff the routine could find two commutable operands in the
	/// given machine instruction.
	/// The 'SrcOpIdx1' and 'SrcOpIdx2' are INPUT and OUTPUT arguments. Their
	/// input values can be re-defined in this method only if the input values
	/// are not pre-defined, which is designated by the special value
	/// 'CommuteAnyOperandIndex' assigned to it.
	/// If both of indices are pre-defined and refer to some operands, then the
	/// method simply returns true if the corresponding operands are commutable
	/// and returns false otherwise.
	///
	/// For example, calling this method this way:
	/// unsigned Op1 = 1, Op2 = CommuteAnyOperandIndex;
	/// findCommutedOpIndices(MI, Op1, Op2);
	/// can be interpreted as a query asking to find an operand that would be
	/// commutable with the operand#1.
	bool findCommutedOpIndices(MachineInstr &MI, unsigned &SrcOpIdx1,
	unsigned &SrcOpIdx2) const override;

	/// Returns true if the routine could find two commutable operands
	/// in the given FMA instruction \p MI. Otherwise, returns false.
	///
	/// \p SrcOpIdx1 and \p SrcOpIdx2 are INPUT and OUTPUT arguments.
	/// The output indices of the commuted operands are returned in these
	/// arguments. Also, the input values of these arguments may be preset either
	/// to indices of operands that must be commuted or be equal to a special
	/// value 'CommuteAnyOperandIndex' which means that the corresponding
	/// operand index is not set and this method is free to pick any of
	/// available commutable operands.
	/// The parameter \p FMA3Group keeps the reference to the group of relative
	/// FMA3 opcodes including register/memory forms of 132/213/231 opcodes.
	///
	/// For example, calling this method this way:
	/// unsigned Idx1 = 1, Idx2 = CommuteAnyOperandIndex;
	/// findFMA3CommutedOpIndices(MI, Idx1, Idx2, FMA3Group);
	/// can be interpreted as a query asking if the operand #1 can be swapped
	/// with any other available operand (e.g. operand #2, operand #3, etc.).
	///
	/// The returned FMA opcode may differ from the opcode in the given MI.
	/// For example, commuting the operands #1 and #3 in the following FMA
	/// FMA213 #1, #2, #3
	/// results into instruction with adjusted opcode:
	/// FMA231 #3, #2, #1
	bool findFMA3CommutedOpIndices(const MachineInstr &MI,
	unsigned &SrcOpIdx1,
	unsigned &SrcOpIdx2,
	const X86InstrFMA3Group &FMA3Group) const;

	/// Returns an adjusted FMA opcode that must be used in FMA instruction that
	/// performs the same computations as the given \p MI but which has the
	/// operands \p SrcOpIdx1 and \p SrcOpIdx2 commuted.
	/// It may return 0 if it is unsafe to commute the operands.
	/// Note that a machine instruction (instead of its opcode) is passed as the
	/// first parameter to make it possible to analyze the instruction's uses and
	/// commute the first operand of FMA even when it seems unsafe when you look
	/// at the opcode. For example, it is Ok to commute the first operand of
	/// VFMADD*SD_Int, if ONLY the lowest 64-bit element of the result is used.
	///
	/// The returned FMA opcode may differ from the opcode in the given \p MI.
	/// For example, commuting the operands #1 and #3 in the following FMA
	/// FMA213 #1, #2, #3
	/// results into instruction with adjusted opcode:
	/// FMA231 #3, #2, #1
	unsigned getFMA3OpcodeToCommuteOperands(const MachineInstr &MI,
	unsigned SrcOpIdx1,
	unsigned SrcOpIdx2,
	const X86InstrFMA3Group &FMA3Group) const;

	// Branch analysis.
	bool isUnpredicatedTerminator(const MachineInstr &MI) const override;
	- bool isUnconditionalTailCall(const MachineInstr &MI) const override;
	- bool canMakeTailCallConditional(SmallVectorImpl<MachineOperand> &Cond,
	- const MachineInstr &TailCall) const override;
	- void replaceBranchWithTailCall(MachineBasicBlock &MBB,
	- SmallVectorImpl<MachineOperand> &Cond,
	- const MachineInstr &TailCall) const override;
	-
	bool analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
	MachineBasicBlock *&FBB,
	SmallVectorImpl<MachineOperand> &Cond,
	bool AllowModify) const override;

	bool getMemOpBaseRegImmOfs(MachineInstr &LdSt, unsigned &BaseReg,
	int64_t &Offset,
	const TargetRegisterInfo *TRI) const override;
	bool analyzeBranchPredicate(MachineBasicBlock &MBB,
	TargetInstrInfo::MachineBranchPredicate &MBP,
	bool AllowModify = false) const override;

	unsigned removeBranch(MachineBasicBlock &MBB,
	int *BytesRemoved = nullptr) const override;
	unsigned insertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
	MachineBasicBlock *FBB, ArrayRef<MachineOperand> Cond,
	const DebugLoc &DL,
	int *BytesAdded = nullptr) const override;
	bool canInsertSelect(const MachineBasicBlock&, ArrayRef<MachineOperand> Cond,
	unsigned, unsigned, int&, int&, int&) const override;
	void insertSelect(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
	const DebugLoc &DL, unsigned DstReg,
	ArrayRef<MachineOperand> Cond, unsigned TrueReg,
	unsigned FalseReg) const override;
	void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
	const DebugLoc &DL, unsigned DestReg, unsigned SrcReg,
	bool KillSrc) const override;
	void storeRegToStackSlot(MachineBasicBlock &MBB,
	MachineBasicBlock::iterator MI,
	unsigned SrcReg, bool isKill, int FrameIndex,
	const TargetRegisterClass *RC,
	const TargetRegisterInfo *TRI) const override;

	void storeRegToAddr(MachineFunction &MF, unsigned SrcReg, bool isKill,
	SmallVectorImpl<MachineOperand> &Addr,
	const TargetRegisterClass *RC,
	MachineInstr::mmo_iterator MMOBegin,
	MachineInstr::mmo_iterator MMOEnd,
	SmallVectorImpl<MachineInstr*> &NewMIs) const;

	void loadRegFromStackSlot(MachineBasicBlock &MBB,
	MachineBasicBlock::iterator MI,
	unsigned DestReg, int FrameIndex,
	const TargetRegisterClass *RC,
	const TargetRegisterInfo *TRI) const override;

	void loadRegFromAddr(MachineFunction &MF, unsigned DestReg,
	SmallVectorImpl<MachineOperand> &Addr,
	const TargetRegisterClass *RC,
	MachineInstr::mmo_iterator MMOBegin,
	MachineInstr::mmo_iterator MMOEnd,
	SmallVectorImpl<MachineInstr*> &NewMIs) const;

	bool expandPostRAPseudo(MachineInstr &MI) const override;

	/// Check whether the target can fold a load that feeds a subreg operand
	/// (or a subreg operand that feeds a store).
	bool isSubregFoldable() const override { return true; }

	/// foldMemoryOperand - If this target supports it, fold a load or store of
	/// the specified stack slot into the specified machine instruction for the
	/// specified operand(s). If this is possible, the target should perform the
	/// folding and return true, otherwise it should return false. If it folds
	/// the instruction, it is likely that the MachineInstruction the iterator
	/// references has been changed.
	MachineInstr *
	foldMemoryOperandImpl(MachineFunction &MF, MachineInstr &MI,
	ArrayRef<unsigned> Ops,
	MachineBasicBlock::iterator InsertPt, int FrameIndex,
	LiveIntervals *LIS = nullptr) const override;

	/// foldMemoryOperand - Same as the previous version except it allows folding
	/// of any load and store from / to any address, not just from a specific
	/// stack slot.
	MachineInstr *foldMemoryOperandImpl(
	MachineFunction &MF, MachineInstr &MI, ArrayRef<unsigned> Ops,
	MachineBasicBlock::iterator InsertPt, MachineInstr &LoadMI,
	LiveIntervals *LIS = nullptr) const override;

	/// unfoldMemoryOperand - Separate a single instruction which folded a load or
	/// a store or a load and a store into two or more instruction. If this is
	/// possible, returns true as well as the new instructions by reference.
	bool
	unfoldMemoryOperand(MachineFunction &MF, MachineInstr &MI, unsigned Reg,
	bool UnfoldLoad, bool UnfoldStore,
	SmallVectorImpl<MachineInstr *> &NewMIs) const override;

	bool unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N,
	SmallVectorImpl<SDNode*> &NewNodes) const override;

	/// getOpcodeAfterMemoryUnfold - Returns the opcode of the would be new
	/// instruction after load / store are unfolded from an instruction of the
	/// specified opcode. It returns zero if the specified unfolding is not
	/// possible. If LoadRegIndex is non-null, it is filled in with the operand
	/// index of the operand which will hold the register holding the loaded
	/// value.
	unsigned getOpcodeAfterMemoryUnfold(unsigned Opc,
	bool UnfoldLoad, bool UnfoldStore,
	unsigned *LoadRegIndex = nullptr) const override;

	/// areLoadsFromSameBasePtr - This is used by the pre-regalloc scheduler
	/// to determine if two loads are loading from the same base address. It
	/// should only return true if the base pointers are the same and the
	/// only differences between the two addresses are the offset. It also returns
	/// the offsets by reference.
	bool areLoadsFromSameBasePtr(SDNode Load1, SDNode Load2, int64_t &Offset1,
	int64_t &Offset2) const override;

	/// shouldScheduleLoadsNear - This is a used by the pre-regalloc scheduler to
	/// determine (in conjunction with areLoadsFromSameBasePtr) if two loads should
	/// be scheduled togther. On some targets if two loads are loading from
	/// addresses in the same cache line, it's better if they are scheduled
	/// together. This function takes two integers that represent the load offsets
	/// from the common base address. It returns true if it decides it's desirable
	/// to schedule the two loads together. "NumLoads" is the number of loads that
	/// have already been scheduled after Load1.
	bool shouldScheduleLoadsNear(SDNode Load1, SDNode Load2,
	int64_t Offset1, int64_t Offset2,
	unsigned NumLoads) const override;

	bool shouldScheduleAdjacent(const MachineInstr &First,
	const MachineInstr &Second) const override;

	void getNoopForMachoTarget(MCInst &NopInst) const override;

	bool
	reverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const override;

	/// isSafeToMoveRegClassDefs - Return true if it's safe to move a machine
	/// instruction that defines the specified register class.
	bool isSafeToMoveRegClassDefs(const TargetRegisterClass *RC) const override;

	/// isSafeToClobberEFLAGS - Return true if it's safe to insert an instruction
	/// that would clobber the EFLAGS condition register. Note the result may be
	/// conservative. If it cannot definitely determine the safety after visiting
	/// a few instructions in each direction it assumes it's not safe.
	bool isSafeToClobberEFLAGS(MachineBasicBlock &MBB,
	MachineBasicBlock::iterator I) const;

	/// True if MI has a condition code def, e.g. EFLAGS, that is
	/// not marked dead.
	bool hasLiveCondCodeDef(MachineInstr &MI) const;

	/// getGlobalBaseReg - Return a virtual register initialized with the
	/// global base register value. Output instructions required to
	/// initialize the register in the function entry block, if necessary.
	///
	unsigned getGlobalBaseReg(MachineFunction *MF) const;

	std::pair<uint16_t, uint16_t>
	getExecutionDomain(const MachineInstr &MI) const override;

	void setExecutionDomain(MachineInstr &MI, unsigned Domain) const override;

	unsigned
	getPartialRegUpdateClearance(const MachineInstr &MI, unsigned OpNum,
	const TargetRegisterInfo *TRI) const override;
	unsigned getUndefRegClearance(const MachineInstr &MI, unsigned &OpNum,
	const TargetRegisterInfo *TRI) const override;
	void breakPartialRegDependency(MachineInstr &MI, unsigned OpNum,
	const TargetRegisterInfo *TRI) const override;

	MachineInstr *foldMemoryOperandImpl(MachineFunction &MF, MachineInstr &MI,
	unsigned OpNum,
	ArrayRef<MachineOperand> MOs,
	MachineBasicBlock::iterator InsertPt,
	unsigned Size, unsigned Alignment,
	bool AllowCommute) const;

	bool isHighLatencyDef(int opc) const override;

	bool hasHighOperandLatency(const TargetSchedModel &SchedModel,
	const MachineRegisterInfo *MRI,
	const MachineInstr &DefMI, unsigned DefIdx,
	const MachineInstr &UseMI,
	unsigned UseIdx) const override;

	bool useMachineCombiner() const override {
	return true;
	}

	bool isAssociativeAndCommutative(const MachineInstr &Inst) const override;

	bool hasReassociableOperands(const MachineInstr &Inst,
	const MachineBasicBlock *MBB) const override;

	void setSpecialOperandAttr(MachineInstr &OldMI1, MachineInstr &OldMI2,
	MachineInstr &NewMI1,
	MachineInstr &NewMI2) const override;

	/// analyzeCompare - For a comparison instruction, return the source registers
	/// in SrcReg and SrcReg2 if having two register operands, and the value it
	/// compares against in CmpValue. Return true if the comparison instruction
	/// can be analyzed.
	bool analyzeCompare(const MachineInstr &MI, unsigned &SrcReg,
	unsigned &SrcReg2, int &CmpMask,
	int &CmpValue) const override;

	/// optimizeCompareInstr - Check if there exists an earlier instruction that
	/// operates on the same source operands and sets flags in the same way as
	/// Compare; remove Compare if possible.
	bool optimizeCompareInstr(MachineInstr &CmpInstr, unsigned SrcReg,
	unsigned SrcReg2, int CmpMask, int CmpValue,
	const MachineRegisterInfo *MRI) const override;

	/// optimizeLoadInstr - Try to remove the load by folding it to a register
	/// operand at the use. We fold the load instructions if and only if the
	/// def and use are in the same BB. We only look at one load and see
	/// whether it can be folded into MI. FoldAsLoadDefReg is the virtual register
	/// defined by the load we are trying to fold. DefMI returns the machine
	/// instruction that defines FoldAsLoadDefReg, and the function returns
	/// the machine instruction generated due to folding.
	MachineInstr *optimizeLoadInstr(MachineInstr &MI,
	const MachineRegisterInfo *MRI,
	unsigned &FoldAsLoadDefReg,
	MachineInstr *&DefMI) const override;

	std::pair<unsigned, unsigned>
	decomposeMachineOperandsTargetFlags(unsigned TF) const override;

	ArrayRef<std::pair<unsigned, const char *>>
	getSerializableDirectMachineOperandTargetFlags() const override;

	bool isTailCall(const MachineInstr &Inst) const override;

	protected:
	/// Commutes the operands in the given instruction by changing the operands
	/// order and/or changing the instruction's opcode and/or the immediate value
	/// operand.
	///
	/// The arguments 'CommuteOpIdx1' and 'CommuteOpIdx2' specify the operands
	/// to be commuted.
	///
	/// Do not call this method for a non-commutable instruction or
	/// non-commutable operands.
	/// Even though the instruction is commutable, the method may still
	/// fail to commute the operands, null pointer is returned in such cases.
	MachineInstr *commuteInstructionImpl(MachineInstr &MI, bool NewMI,
	unsigned CommuteOpIdx1,
	unsigned CommuteOpIdx2) const override;

	private:
	MachineInstr *convertToThreeAddressWithLEA(unsigned MIOpc,
	MachineFunction::iterator &MFI,
	MachineInstr &MI,
	LiveVariables *LV) const;

	/// Handles memory folding for special case instructions, for instance those
	/// requiring custom manipulation of the address.
	MachineInstr *foldMemoryOperandCustom(MachineFunction &MF, MachineInstr &MI,
	unsigned OpNum,
	ArrayRef<MachineOperand> MOs,
	MachineBasicBlock::iterator InsertPt,
	unsigned Size, unsigned Align) const;

	/// isFrameOperand - Return true and the FrameIndex if the specified
	/// operand and the following operands form a reference to the stack frame.
	bool isFrameOperand(const MachineInstr &MI, unsigned int Op,
	int &FrameIndex) const;

	/// Returns true iff the routine could find two commutable operands in the
	/// given machine instruction with 3 vector inputs.
	/// The 'SrcOpIdx1' and 'SrcOpIdx2' are INPUT and OUTPUT arguments. Their
	/// input values can be re-defined in this method only if the input values
	/// are not pre-defined, which is designated by the special value
	/// 'CommuteAnyOperandIndex' assigned to it.
	/// If both indices are pre-defined and refer to some operands, then the
	/// method simply returns true if the corresponding operands are commutable
	/// and returns false otherwise.
	///
	/// For example, calling this method this way:
	/// unsigned Op1 = 1, Op2 = CommuteAnyOperandIndex;
	/// findThreeSrcCommutedOpIndices(MI, Op1, Op2);
	/// can be interpreted as a query asking to find an operand that would be
	/// commutable with the operand#1.
	bool findThreeSrcCommutedOpIndices(const MachineInstr &MI,
	unsigned &SrcOpIdx1,
	unsigned &SrcOpIdx2) const;
	};

	} // End llvm namespace

	#endif
	Index: projects/clang400-import/contrib/llvm/lib/Target/X86/X86MCInstLower.cpp
	===================================================================
	--- projects/clang400-import/contrib/llvm/lib/Target/X86/X86MCInstLower.cpp (revision 313642)
	+++ projects/clang400-import/contrib/llvm/lib/Target/X86/X86MCInstLower.cpp (revision 313643)
	@@ -1,1745 +1,1738 @@
	//===-- X86MCInstLower.cpp - Convert X86 MachineInstr to an MCInst --------===//
	//
	// The LLVM Compiler Infrastructure
	//
	// This file is distributed under the University of Illinois Open Source
	// License. See LICENSE.TXT for details.
	//
	//===----------------------------------------------------------------------===//
	//
	// This file contains code to lower X86 MachineInstrs to their corresponding
	// MCInst records.
	//
	//===----------------------------------------------------------------------===//

	#include "X86AsmPrinter.h"
	#include "X86RegisterInfo.h"
	#include "X86ShuffleDecodeConstantPool.h"
	#include "InstPrinter/X86ATTInstPrinter.h"
	#include "InstPrinter/X86InstComments.h"
	#include "MCTargetDesc/X86BaseInfo.h"
	#include "Utils/X86ShuffleDecode.h"
	#include "llvm/ADT/Optional.h"
	#include "llvm/ADT/SmallString.h"
	#include "llvm/ADT/iterator_range.h"
	#include "llvm/CodeGen/MachineFunction.h"
	#include "llvm/CodeGen/MachineConstantPool.h"
	#include "llvm/CodeGen/MachineOperand.h"
	#include "llvm/CodeGen/MachineModuleInfoImpls.h"
	#include "llvm/CodeGen/StackMaps.h"
	#include "llvm/IR/DataLayout.h"
	#include "llvm/IR/GlobalValue.h"
	#include "llvm/IR/Mangler.h"
	#include "llvm/MC/MCAsmInfo.h"
	#include "llvm/MC/MCCodeEmitter.h"
	#include "llvm/MC/MCContext.h"
	#include "llvm/MC/MCExpr.h"
	#include "llvm/MC/MCFixup.h"
	#include "llvm/MC/MCInst.h"
	#include "llvm/MC/MCInstBuilder.h"
	#include "llvm/MC/MCSection.h"
	#include "llvm/MC/MCStreamer.h"
	#include "llvm/MC/MCSymbol.h"
	#include "llvm/MC/MCSymbolELF.h"
	#include "llvm/MC/MCSectionELF.h"
	#include "llvm/MC/MCSectionMachO.h"
	#include "llvm/Support/TargetRegistry.h"
	#include "llvm/Support/ELF.h"
	#include "llvm/Target/TargetLoweringObjectFile.h"

	using namespace llvm;

	namespace {

	/// X86MCInstLower - This class is used to lower a MachineInstr into an MCInst.
	///
	/// All members are references bound by the constructor; the class holds no
	/// state of its own.
	class X86MCInstLower {
	// Context used to create symbols and expressions.
	MCContext &Ctx;
	// Function whose instructions are being lowered.
	const MachineFunction &MF;
	// Target machine of MF. Must stay declared before MAI: the constructor
	// initializes MAI from TM.
	const TargetMachine &TM;
	// Assembler info obtained from TM.
	const MCAsmInfo &MAI;
	// Printer used for symbol lookup and for emitting assignments.
	X86AsmPrinter &AsmPrinter;
	public:
	X86MCInstLower(const MachineFunction &MF, X86AsmPrinter &asmprinter);

	/// Lower a single operand of \p MI. Returns None for operands that are
	/// dropped (implicit registers and register masks).
	Optional<MCOperand> LowerMachineOperand(const MachineInstr *MI,
	const MachineOperand &MO) const;
	/// Lower \p MI into \p OutMI, applying the opcode-specific rewrites.
	void Lower(const MachineInstr *MI, MCInst &OutMI) const;

	/// Return the MCSymbol referenced by a global, external-symbol or
	/// basic-block operand.
	MCSymbol *GetSymbolFromOperand(const MachineOperand &MO) const;
	/// Wrap \p Sym in the expression selected by the target flags of \p MO.
	MCOperand LowerSymbolOperand(const MachineOperand &MO, MCSymbol *Sym) const;

	private:
	/// Return the Mach-O specific module info of MF.
	MachineModuleInfoMachO &getMachOMMI() const;
	};

	} // end anonymous namespace

	// Emit a minimal sequence of nops spanning NumBytes bytes.
	static void EmitNops(MCStreamer &OS, unsigned NumBytes, bool Is64Bit,
	const MCSubtargetInfo &STI);

	/// Add the encoded size of \p Inst to the shadow currently being tracked.
	/// Does nothing when no shadow is active; stops tracking once the required
	/// shadow size has been reached.
	void X86AsmPrinter::StackMapShadowTracker::count(MCInst &Inst,
	                                                 const MCSubtargetInfo &STI,
	                                                 MCCodeEmitter *CodeEmitter) {
	  if (!InShadow)
	    return;

	  // Encode into a scratch buffer; only the resulting byte count is used.
	  SmallString<256> Encoded;
	  SmallVector<MCFixup, 4> ScratchFixups;
	  raw_svector_ostream EncodedOS(Encoded);
	  CodeEmitter->encodeInstruction(Inst, EncodedOS, ScratchFixups, STI);

	  CurrentShadowSize += Encoded.size();

	  // Keep counting only while the shadow is still too small.
	  InShadow = CurrentShadowSize < RequiredShadowSize;
	}

	/// If a shadow is still being tracked and is smaller than required, stop
	/// tracking and emit nops for the missing bytes.
	void X86AsmPrinter::StackMapShadowTracker::emitShadowPadding(
	    MCStreamer &OutStreamer, const MCSubtargetInfo &STI) {
	  if (!InShadow)
	    return;
	  if (CurrentShadowSize >= RequiredShadowSize)
	    return;

	  InShadow = false;
	  const auto MissingBytes = RequiredShadowSize - CurrentShadowSize;
	  const bool Is64Bit = MF->getSubtarget<X86Subtarget>().is64Bit();
	  EmitNops(OutStreamer, MissingBytes, Is64Bit, STI);
	}

	/// Emit \p Inst to the output streamer, then let the stackmap shadow
	/// tracker account for it.
	void X86AsmPrinter::EmitAndCountInstruction(MCInst &Inst) {
	  const MCSubtargetInfo &STI = getSubtargetInfo();
	  OutStreamer->EmitInstruction(Inst, STI);
	  SMShadowTracker.count(Inst, STI, CodeEmitter.get());
	}

	/// Bind the lowering helper to the function \p mf and the printer
	/// \p asmprinter. The context and target machine are taken from \p mf.
	X86MCInstLower::X86MCInstLower(const MachineFunction &mf,
	X86AsmPrinter &asmprinter)
	// MAI reads TM, so it relies on TM being initialized first (members are
	// initialized in declaration order).
	: Ctx(mf.getContext()), MF(mf), TM(mf.getTarget()), MAI(*TM.getMCAsmInfo()),
	AsmPrinter(asmprinter) {}

	/// Return the Mach-O object-file info attached to the module info of MF.
	MachineModuleInfoMachO &X86MCInstLower::getMachOMMI() const {
	  auto &ModuleInfo = MF.getMMI();
	  return ModuleInfo.getObjFileInfo<MachineModuleInfoMachO>();
	}


	/// GetSymbolFromOperand - Lower an MO_GlobalAddress or MO_ExternalSymbol
	/// operand to an MCSymbol.
	///
	/// Basic-block operands are accepted as well and resolve to the block's own
	/// symbol. The operand's target flags may add a "__imp_" prefix (dllimport)
	/// or a "$non_lazy_ptr" suffix (Darwin non-lazy references); in the latter
	/// case a stub entry for the symbol is registered with the Mach-O module info.
	MCSymbol *X86MCInstLower::
	GetSymbolFromOperand(const MachineOperand &MO) const {
	const DataLayout &DL = MF.getDataLayout();
	assert((MO.isGlobal() \|\| MO.isSymbol() \|\| MO.isMBB()) && "Isn't a symbol reference");

	MCSymbol *Sym = nullptr;
	SmallString<128> Name;
	StringRef Suffix;

	// First pass over the target flags: choose the decoration of the name.
	// Flags not listed here leave the name undecorated.
	switch (MO.getTargetFlags()) {
	case X86II::MO_DLLIMPORT:
	// Handle dllimport linkage.
	Name += "__imp_";
	break;
	case X86II::MO_DARWIN_NONLAZY:
	case X86II::MO_DARWIN_NONLAZY_PIC_BASE:
	Suffix = "$non_lazy_ptr";
	break;
	}

	// Only suffixed (stub) names get the private global prefix.
	if (!Suffix.empty())
	Name += DL.getPrivateGlobalPrefix();

	// Append the mangled base name, or take the block symbol directly.
	if (MO.isGlobal()) {
	const GlobalValue *GV = MO.getGlobal();
	AsmPrinter.getNameWithPrefix(Name, GV);
	} else if (MO.isSymbol()) {
	Mangler::getNameWithPrefix(Name, MO.getSymbolName(), DL);
	} else if (MO.isMBB()) {
	// A basic block is never referenced through a stub.
	assert(Suffix.empty());
	Sym = MO.getMBB()->getSymbol();
	}

	// For basic blocks Sym is already set and Name is not used.
	Name += Suffix;
	if (!Sym)
	Sym = Ctx.getOrCreateSymbol(Name);

	// If the target flags on the operand changes the name of the symbol, do that
	// before we return the symbol.
	switch (MO.getTargetFlags()) {
	default: break;
	case X86II::MO_DARWIN_NONLAZY:
	case X86II::MO_DARWIN_NONLAZY_PIC_BASE: {
	MachineModuleInfoImpl::StubValueTy &StubSym =
	getMachOMMI().getGVStubEntry(Sym);
	// Fill in the stub entry only the first time this stub symbol is seen.
	if (!StubSym.getPointer()) {
	assert(MO.isGlobal() && "Extern symbol not handled yet");
	// The entry pairs the global's real symbol with a flag that is true when
	// the global does not have internal linkage.
	StubSym =
	MachineModuleInfoImpl::
	StubValueTy(AsmPrinter.getSymbol(MO.getGlobal()),
	!MO.getGlobal()->hasInternalLinkage());
	}
	break;
	}
	}

	return Sym;
	}

	/// Build the expression operand that refers to \p Sym as requested by \p MO.
	///
	/// The target flags of \p MO select either a relocation variant for a plain
	/// symbol reference or a PIC-base-relative difference. A non-zero operand
	/// offset is added on top, except for jump-table and basic-block operands.
	/// Unknown target flags hit llvm_unreachable.
	MCOperand X86MCInstLower::LowerSymbolOperand(const MachineOperand &MO,
	MCSymbol *Sym) const {
	// FIXME: We would like an efficient form for this, so we don't have to do a
	// lot of extra uniquing.
	// Expr stays null for the cases that only pick a RefKind; it is built from
	// Sym and RefKind after the switch.
	const MCExpr *Expr = nullptr;
	MCSymbolRefExpr::VariantKind RefKind = MCSymbolRefExpr::VK_None;

	switch (MO.getTargetFlags()) {
	default: llvm_unreachable("Unknown target flag on GV operand");
	case X86II::MO_NO_FLAG: // No flag.
	// These affect the name of the symbol, not any suffix.
	case X86II::MO_DARWIN_NONLAZY:
	case X86II::MO_DLLIMPORT:
	break;

	case X86II::MO_TLVP: RefKind = MCSymbolRefExpr::VK_TLVP; break;
	case X86II::MO_TLVP_PIC_BASE:
	Expr = MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_TLVP, Ctx);
	// Subtract the pic base.
	Expr = MCBinaryExpr::createSub(Expr,
	MCSymbolRefExpr::create(MF.getPICBaseSymbol(),
	Ctx),
	Ctx);
	break;
	case X86II::MO_SECREL: RefKind = MCSymbolRefExpr::VK_SECREL; break;
	case X86II::MO_TLSGD: RefKind = MCSymbolRefExpr::VK_TLSGD; break;
	case X86II::MO_TLSLD: RefKind = MCSymbolRefExpr::VK_TLSLD; break;
	case X86II::MO_TLSLDM: RefKind = MCSymbolRefExpr::VK_TLSLDM; break;
	case X86II::MO_GOTTPOFF: RefKind = MCSymbolRefExpr::VK_GOTTPOFF; break;
	case X86II::MO_INDNTPOFF: RefKind = MCSymbolRefExpr::VK_INDNTPOFF; break;
	case X86II::MO_TPOFF: RefKind = MCSymbolRefExpr::VK_TPOFF; break;
	case X86II::MO_DTPOFF: RefKind = MCSymbolRefExpr::VK_DTPOFF; break;
	case X86II::MO_NTPOFF: RefKind = MCSymbolRefExpr::VK_NTPOFF; break;
	case X86II::MO_GOTNTPOFF: RefKind = MCSymbolRefExpr::VK_GOTNTPOFF; break;
	case X86II::MO_GOTPCREL: RefKind = MCSymbolRefExpr::VK_GOTPCREL; break;
	case X86II::MO_GOT: RefKind = MCSymbolRefExpr::VK_GOT; break;
	case X86II::MO_GOTOFF: RefKind = MCSymbolRefExpr::VK_GOTOFF; break;
	case X86II::MO_PLT: RefKind = MCSymbolRefExpr::VK_PLT; break;
	case X86II::MO_PIC_BASE_OFFSET:
	case X86II::MO_DARWIN_NONLAZY_PIC_BASE:
	Expr = MCSymbolRefExpr::create(Sym, Ctx);
	// Subtract the pic base.
	Expr = MCBinaryExpr::createSub(Expr,
	MCSymbolRefExpr::create(MF.getPICBaseSymbol(), Ctx),
	Ctx);
	if (MO.isJTI()) {
	assert(MAI.doesSetDirectiveSuppressReloc());
	// If .set directive is supported, use it to reduce the number of
	// relocations the assembler will generate for differences between
	// local labels. This is only safe when the symbols are in the same
	// section so we are restricting it to jumptable references.
	// Side effect: an assignment of the difference to a fresh temporary
	// label is emitted to the output streamer.
	MCSymbol *Label = Ctx.createTempSymbol();
	AsmPrinter.OutStreamer->EmitAssignment(Label, Expr);
	Expr = MCSymbolRefExpr::create(Label, Ctx);
	}
	break;
	}

	// Cases that only selected a variant kind: plain reference to Sym.
	if (!Expr)
	Expr = MCSymbolRefExpr::create(Sym, RefKind, Ctx);

	// Jump-table and basic-block operands are excluded before the offset is
	// read; for all others a non-zero offset is added to the expression.
	if (!MO.isJTI() && !MO.isMBB() && MO.getOffset())
	Expr = MCBinaryExpr::createAdd(Expr,
	MCConstantExpr::create(MO.getOffset(), Ctx),
	Ctx);
	return MCOperand::createExpr(Expr);
	}


	/// \brief Simplify FOO $imm, %{al,ax,eax,rax} to FOO $imm, for instructions
	/// with a short fixed-register form.
	///
	/// \p Inst is rewritten in place to opcode \p Opcode with the immediate (or
	/// expression) as its only operand, but only when operand 0 is AL, AX, EAX
	/// or RAX. Otherwise \p Inst is left untouched.
	static void SimplifyShortImmForm(MCInst &Inst, unsigned Opcode) {
	// The immediate is always the last operand.
	unsigned ImmOp = Inst.getNumOperands() - 1;
	// Accepted shapes: (reg, imm) or (reg, same reg, imm).
	assert(Inst.getOperand(0).isReg() &&
	(Inst.getOperand(ImmOp).isImm() \|\| Inst.getOperand(ImmOp).isExpr()) &&
	((Inst.getNumOperands() == 3 && Inst.getOperand(1).isReg() &&
	Inst.getOperand(0).getReg() == Inst.getOperand(1).getReg()) \|\|
	Inst.getNumOperands() == 2) && "Unexpected instruction!");

	// Check whether the destination register can be fixed.
	unsigned Reg = Inst.getOperand(0).getReg();
	if (Reg != X86::AL && Reg != X86::AX && Reg != X86::EAX && Reg != X86::RAX)
	return;

	// If so, rewrite the instruction.
	// The operand is copied out first because Inst is reset below.
	MCOperand Saved = Inst.getOperand(ImmOp);
	Inst = MCInst();
	Inst.setOpcode(Opcode);
	Inst.addOperand(Saved);
	}

	/// \brief Replace a register-to-register movsx by the operand-less
	/// sign-extension instruction when both registers are the matching
	/// accumulator registers.
	///
	/// \p Inst must be one of MOVSX16rr8, MOVSX32rr16 or MOVSX64rr32; any other
	/// opcode is unreachable. When the registers do not match, \p Inst is left
	/// unchanged.
	static void SimplifyMOVSX(MCInst &Inst) {
	  const unsigned DstReg = Inst.getOperand(0).getReg();
	  const unsigned SrcReg = Inst.getOperand(1).getReg();

	  // Zero means "no shorter form applies".
	  unsigned ShortOpc = 0;
	  switch (Inst.getOpcode()) {
	  case X86::MOVSX16rr8: // movsbw %al, %ax --> cbtw
	    if (DstReg == X86::AX && SrcReg == X86::AL)
	      ShortOpc = X86::CBW;
	    break;
	  case X86::MOVSX32rr16: // movswl %ax, %eax --> cwtl
	    if (DstReg == X86::EAX && SrcReg == X86::AX)
	      ShortOpc = X86::CWDE;
	    break;
	  case X86::MOVSX64rr32: // movslq %eax, %rax --> cltq
	    if (DstReg == X86::RAX && SrcReg == X86::EAX)
	      ShortOpc = X86::CDQE;
	    break;
	  default:
	    llvm_unreachable("Unexpected instruction!");
	  }

	  if (ShortOpc == 0)
	    return;

	  // The short form takes no explicit operands.
	  Inst = MCInst();
	  Inst.setOpcode(ShortOpc);
	}

	/// \brief Simplify things like MOV32rm to MOV32o32a.
	///
	/// \p Inst is rewritten in place to opcode \p Opcode with two operands, the
	/// address displacement and the segment register. The rewrite is skipped in
	/// 64-bit mode, when the register operand is not AL/AX/EAX/RAX, or when the
	/// address is treated as absolute but uses a base register, an index
	/// register or a scale other than 1.
	static void SimplifyShortMoveForm(X86AsmPrinter &Printer, MCInst &Inst,
	unsigned Opcode) {
	// Don't make these simplifications in 64-bit mode; other assemblers don't
	// perform them because they make the code larger.
	if (Printer.getSubtarget().is64Bit())
	return;

	// NOTE(review): despite its name, IsStore is true when operands 0 and 1 are
	// both registers. In that case the register operand is operand 0 and the
	// address operands start at index 1, which looks like the load (rm) layout;
	// otherwise the address starts at index 0 and the register is operand 5.
	// The indices below are consistent with that reading - confirm before
	// renaming.
	bool IsStore = Inst.getOperand(0).isReg() && Inst.getOperand(1).isReg();
	unsigned AddrBase = IsStore;
	unsigned RegOp = IsStore ? 0 : 5;
	// Index of the displacement operand within Inst.
	unsigned AddrOp = AddrBase + 3;
	assert(Inst.getNumOperands() == 6 && Inst.getOperand(RegOp).isReg() &&
	Inst.getOperand(AddrBase + X86::AddrBaseReg).isReg() &&
	Inst.getOperand(AddrBase + X86::AddrScaleAmt).isImm() &&
	Inst.getOperand(AddrBase + X86::AddrIndexReg).isReg() &&
	Inst.getOperand(AddrBase + X86::AddrSegmentReg).isReg() &&
	(Inst.getOperand(AddrOp).isExpr() \|\|
	Inst.getOperand(AddrOp).isImm()) &&
	"Unexpected instruction!");

	// Check whether the destination register can be fixed.
	unsigned Reg = Inst.getOperand(RegOp).getReg();
	if (Reg != X86::AL && Reg != X86::AX && Reg != X86::EAX && Reg != X86::RAX)
	return;

	// Check whether this is an absolute address.
	// FIXME: We know TLVP symbol refs aren't, but there should be a better way
	// to do this here.
	bool Absolute = true;
	if (Inst.getOperand(AddrOp).isExpr()) {
	const MCExpr *MCE = Inst.getOperand(AddrOp).getExpr();
	if (const MCSymbolRefExpr *SRE = dyn_cast<MCSymbolRefExpr>(MCE))
	if (SRE->getKind() == MCSymbolRefExpr::VK_TLVP)
	Absolute = false;
	}

	// An absolute address must not involve a base register, an index register
	// or a scale other than 1.
	if (Absolute &&
	(Inst.getOperand(AddrBase + X86::AddrBaseReg).getReg() != 0 \|\|
	Inst.getOperand(AddrBase + X86::AddrScaleAmt).getImm() != 1 \|\|
	Inst.getOperand(AddrBase + X86::AddrIndexReg).getReg() != 0))
	return;

	// If so, rewrite the instruction.
	// Both operands are copied out first because Inst is reset below.
	MCOperand Saved = Inst.getOperand(AddrOp);
	MCOperand Seg = Inst.getOperand(AddrBase + X86::AddrSegmentReg);
	Inst = MCInst();
	Inst.setOpcode(Opcode);
	Inst.addOperand(Saved);
	Inst.addOperand(Seg);
	}

	/// Return the return opcode for \p Subtarget: RETQ in 64-bit mode, RETL
	/// otherwise.
	static unsigned getRetOpcode(const X86Subtarget &Subtarget) {
	  if (Subtarget.is64Bit())
	    return X86::RETQ;
	  return X86::RETL;
	}

	/// Lower operand \p MO of \p MI to an MCOperand.
	///
	/// Returns None for operands that have no MCInst counterpart (implicit
	/// registers and register masks). Operand kinds not handled here dump \p MI
	/// and hit llvm_unreachable.
	Optional<MCOperand>
	X86MCInstLower::LowerMachineOperand(const MachineInstr *MI,
	                                    const MachineOperand &MO) const {
	  // Every symbolic operand kind resolves to a symbol here and shares the
	  // single LowerSymbolOperand call after the switch.
	  MCSymbol *Target = nullptr;

	  switch (MO.getType()) {
	  case MachineOperand::MO_Register:
	    // Ignore all implicit register operands.
	    if (MO.isImplicit())
	      return None;
	    return MCOperand::createReg(MO.getReg());

	  case MachineOperand::MO_Immediate:
	    return MCOperand::createImm(MO.getImm());

	  case MachineOperand::MO_RegisterMask:
	    // Ignore call clobbers.
	    return None;

	  case MachineOperand::MO_MachineBasicBlock:
	  case MachineOperand::MO_GlobalAddress:
	  case MachineOperand::MO_ExternalSymbol:
	    Target = GetSymbolFromOperand(MO);
	    break;

	  case MachineOperand::MO_MCSymbol:
	    Target = MO.getMCSymbol();
	    break;

	  case MachineOperand::MO_JumpTableIndex:
	    Target = AsmPrinter.GetJTISymbol(MO.getIndex());
	    break;

	  case MachineOperand::MO_ConstantPoolIndex:
	    Target = AsmPrinter.GetCPISymbol(MO.getIndex());
	    break;

	  case MachineOperand::MO_BlockAddress:
	    Target = AsmPrinter.GetBlockAddressSymbol(MO.getBlockAddress());
	    break;

	  default:
	    MI->dump();
	    llvm_unreachable("unknown operand type");
	  }

	  return LowerSymbolOperand(MO, Target);
	}

	/// Lower \p MI into \p OutMI.
	///
	/// The opcode is copied, every operand that LowerMachineOperand does not
	/// drop is appended, and then a set of opcodes is rewritten: pseudo
	/// instructions are replaced by real ones, and some instructions are switched
	/// to a different encoding. Cases that only replace the opcode jump back to
	/// ReSimplify so the new opcode goes through the switch again.
	void X86MCInstLower::Lower(const MachineInstr *MI, MCInst &OutMI) const {
	OutMI.setOpcode(MI->getOpcode());

	for (const MachineOperand &MO : MI->operands())
	if (auto MaybeMCOp = LowerMachineOperand(MI, MO))
	OutMI.addOperand(MaybeMCOp.getValue());

	// Handle a few special cases to eliminate operand modifiers.
	ReSimplify:
	switch (OutMI.getOpcode()) {
	case X86::LEA64_32r:
	case X86::LEA64r:
	case X86::LEA16r:
	case X86::LEA32r:
	// LEA should have a segment register, but it must be empty.
	assert(OutMI.getNumOperands() == 1+X86::AddrNumOperands &&
	"Unexpected # of LEA operands");
	assert(OutMI.getOperand(1+X86::AddrSegmentReg).getReg() == 0 &&
	"LEA has segment specified!");
	break;

	// Commute operands to get a smaller encoding by using VEX.R instead of VEX.B
	// if one of the registers is extended, but other isn't.
	case X86::VMOVZPQILo2PQIrr:
	case X86::VMOVAPDrr:
	case X86::VMOVAPDYrr:
	case X86::VMOVAPSrr:
	case X86::VMOVAPSYrr:
	case X86::VMOVDQArr:
	case X86::VMOVDQAYrr:
	case X86::VMOVDQUrr:
	case X86::VMOVDQUYrr:
	case X86::VMOVUPDrr:
	case X86::VMOVUPDYrr:
	case X86::VMOVUPSrr:
	case X86::VMOVUPSYrr: {
	// Only when operand 0 is not extended and operand 1 is.
	if (!X86II::isX86_64ExtendedReg(OutMI.getOperand(0).getReg()) &&
	X86II::isX86_64ExtendedReg(OutMI.getOperand(1).getReg())) {
	unsigned NewOpc;
	switch (OutMI.getOpcode()) {
	default: llvm_unreachable("Invalid opcode");
	case X86::VMOVZPQILo2PQIrr: NewOpc = X86::VMOVPQI2QIrr; break;
	case X86::VMOVAPDrr: NewOpc = X86::VMOVAPDrr_REV; break;
	case X86::VMOVAPDYrr: NewOpc = X86::VMOVAPDYrr_REV; break;
	case X86::VMOVAPSrr: NewOpc = X86::VMOVAPSrr_REV; break;
	case X86::VMOVAPSYrr: NewOpc = X86::VMOVAPSYrr_REV; break;
	case X86::VMOVDQArr: NewOpc = X86::VMOVDQArr_REV; break;
	case X86::VMOVDQAYrr: NewOpc = X86::VMOVDQAYrr_REV; break;
	case X86::VMOVDQUrr: NewOpc = X86::VMOVDQUrr_REV; break;
	case X86::VMOVDQUYrr: NewOpc = X86::VMOVDQUYrr_REV; break;
	case X86::VMOVUPDrr: NewOpc = X86::VMOVUPDrr_REV; break;
	case X86::VMOVUPDYrr: NewOpc = X86::VMOVUPDYrr_REV; break;
	case X86::VMOVUPSrr: NewOpc = X86::VMOVUPSrr_REV; break;
	case X86::VMOVUPSYrr: NewOpc = X86::VMOVUPSYrr_REV; break;
	}
	OutMI.setOpcode(NewOpc);
	}
	break;
	}
	case X86::VMOVSDrr:
	case X86::VMOVSSrr: {
	// Same idea as above, but here operand 2 is the one checked against
	// operand 0.
	if (!X86II::isX86_64ExtendedReg(OutMI.getOperand(0).getReg()) &&
	X86II::isX86_64ExtendedReg(OutMI.getOperand(2).getReg())) {
	unsigned NewOpc;
	switch (OutMI.getOpcode()) {
	default: llvm_unreachable("Invalid opcode");
	case X86::VMOVSDrr: NewOpc = X86::VMOVSDrr_REV; break;
	case X86::VMOVSSrr: NewOpc = X86::VMOVSSrr_REV; break;
	}
	OutMI.setOpcode(NewOpc);
	}
	break;
	}

	// TAILJMPr64, CALL64r, CALL64pcrel32 - These instructions have register
	// inputs modeled as normal uses instead of implicit uses. As such, truncate
	// off all but the first operand (the callee). FIXME: Change isel.
	case X86::TAILJMPr64:
	case X86::TAILJMPr64_REX:
	case X86::CALL64r:
	case X86::CALL64pcrel32: {
	unsigned Opcode = OutMI.getOpcode();
	MCOperand Saved = OutMI.getOperand(0);
	OutMI = MCInst();
	OutMI.setOpcode(Opcode);
	OutMI.addOperand(Saved);
	break;
	}

	case X86::EH_RETURN:
	case X86::EH_RETURN64: {
	// Replaced by a plain return without operands.
	OutMI = MCInst();
	OutMI.setOpcode(getRetOpcode(AsmPrinter.getSubtarget()));
	break;
	}

	case X86::CLEANUPRET: {
	// Replace CLEANUPRET with the appropriate RET.
	OutMI = MCInst();
	OutMI.setOpcode(getRetOpcode(AsmPrinter.getSubtarget()));
	break;
	}

	case X86::CATCHRET: {
	// Replace CATCHRET with the appropriate RET.
	// Unlike the cases above, the return register is added as an operand.
	const X86Subtarget &Subtarget = AsmPrinter.getSubtarget();
	unsigned ReturnReg = Subtarget.is64Bit() ? X86::RAX : X86::EAX;
	OutMI = MCInst();
	OutMI.setOpcode(getRetOpcode(Subtarget));
	OutMI.addOperand(MCOperand::createReg(ReturnReg));
	break;
	}

	- // TAILJMPd, TAILJMPd64, TailJMPd_cc - Lower to the correct jump instruction.
	+ // TAILJMPd, TAILJMPd64 - Lower to the correct jump instruction.
	{ unsigned Opcode;
	case X86::TAILJMPr: Opcode = X86::JMP32r; goto SetTailJmpOpcode;
	case X86::TAILJMPd:
	case X86::TAILJMPd64: Opcode = X86::JMP_1; goto SetTailJmpOpcode;
	- case X86::TAILJMPd_CC:
	- case X86::TAILJMPd64_CC:
	- Opcode = X86::GetCondBranchFromCond(
	- static_cast<X86::CondCode>(MI->getOperand(1).getImm()));
	- goto SetTailJmpOpcode;

	SetTailJmpOpcode:
	// Keep only the first operand and switch to the jump opcode chosen above.
	MCOperand Saved = OutMI.getOperand(0);
	OutMI = MCInst();
	OutMI.setOpcode(Opcode);
	OutMI.addOperand(Saved);
	break;
	}

	case X86::DEC16r:
	case X86::DEC32r:
	case X86::INC16r:
	case X86::INC32r:
	// If we aren't in 64-bit mode we can use the 1-byte inc/dec instructions.
	if (!AsmPrinter.getSubtarget().is64Bit()) {
	unsigned Opcode;
	switch (OutMI.getOpcode()) {
	default: llvm_unreachable("Invalid opcode");
	case X86::DEC16r: Opcode = X86::DEC16r_alt; break;
	case X86::DEC32r: Opcode = X86::DEC32r_alt; break;
	case X86::INC16r: Opcode = X86::INC16r_alt; break;
	case X86::INC32r: Opcode = X86::INC32r_alt; break;
	}
	OutMI.setOpcode(Opcode);
	}
	break;

	// These are pseudo-ops for OR to help with the OR->ADD transformation. We do
	// this with an ugly goto in case the resultant OR uses EAX and needs the
	// short form.
	case X86::ADD16rr_DB: OutMI.setOpcode(X86::OR16rr); goto ReSimplify;
	case X86::ADD32rr_DB: OutMI.setOpcode(X86::OR32rr); goto ReSimplify;
	case X86::ADD64rr_DB: OutMI.setOpcode(X86::OR64rr); goto ReSimplify;
	case X86::ADD16ri_DB: OutMI.setOpcode(X86::OR16ri); goto ReSimplify;
	case X86::ADD32ri_DB: OutMI.setOpcode(X86::OR32ri); goto ReSimplify;
	case X86::ADD64ri32_DB: OutMI.setOpcode(X86::OR64ri32); goto ReSimplify;
	case X86::ADD16ri8_DB: OutMI.setOpcode(X86::OR16ri8); goto ReSimplify;
	case X86::ADD32ri8_DB: OutMI.setOpcode(X86::OR32ri8); goto ReSimplify;
	case X86::ADD64ri8_DB: OutMI.setOpcode(X86::OR64ri8); goto ReSimplify;

	// Atomic load and store require a separate pseudo-inst because Acquire
	// implies mayStore and Release implies mayLoad; fix these to regular MOV
	// instructions here
	case X86::ACQUIRE_MOV8rm: OutMI.setOpcode(X86::MOV8rm); goto ReSimplify;
	case X86::ACQUIRE_MOV16rm: OutMI.setOpcode(X86::MOV16rm); goto ReSimplify;
	case X86::ACQUIRE_MOV32rm: OutMI.setOpcode(X86::MOV32rm); goto ReSimplify;
	case X86::ACQUIRE_MOV64rm: OutMI.setOpcode(X86::MOV64rm); goto ReSimplify;
	case X86::RELEASE_MOV8mr: OutMI.setOpcode(X86::MOV8mr); goto ReSimplify;
	case X86::RELEASE_MOV16mr: OutMI.setOpcode(X86::MOV16mr); goto ReSimplify;
	case X86::RELEASE_MOV32mr: OutMI.setOpcode(X86::MOV32mr); goto ReSimplify;
	case X86::RELEASE_MOV64mr: OutMI.setOpcode(X86::MOV64mr); goto ReSimplify;
	case X86::RELEASE_MOV8mi: OutMI.setOpcode(X86::MOV8mi); goto ReSimplify;
	case X86::RELEASE_MOV16mi: OutMI.setOpcode(X86::MOV16mi); goto ReSimplify;
	case X86::RELEASE_MOV32mi: OutMI.setOpcode(X86::MOV32mi); goto ReSimplify;
	case X86::RELEASE_MOV64mi32: OutMI.setOpcode(X86::MOV64mi32); goto ReSimplify;
	case X86::RELEASE_ADD8mi: OutMI.setOpcode(X86::ADD8mi); goto ReSimplify;
	case X86::RELEASE_ADD8mr: OutMI.setOpcode(X86::ADD8mr); goto ReSimplify;
	case X86::RELEASE_ADD32mi: OutMI.setOpcode(X86::ADD32mi); goto ReSimplify;
	case X86::RELEASE_ADD32mr: OutMI.setOpcode(X86::ADD32mr); goto ReSimplify;
	case X86::RELEASE_ADD64mi32: OutMI.setOpcode(X86::ADD64mi32); goto ReSimplify;
	case X86::RELEASE_ADD64mr: OutMI.setOpcode(X86::ADD64mr); goto ReSimplify;
	case X86::RELEASE_AND8mi: OutMI.setOpcode(X86::AND8mi); goto ReSimplify;
	case X86::RELEASE_AND8mr: OutMI.setOpcode(X86::AND8mr); goto ReSimplify;
	case X86::RELEASE_AND32mi: OutMI.setOpcode(X86::AND32mi); goto ReSimplify;
	case X86::RELEASE_AND32mr: OutMI.setOpcode(X86::AND32mr); goto ReSimplify;
	case X86::RELEASE_AND64mi32: OutMI.setOpcode(X86::AND64mi32); goto ReSimplify;
	case X86::RELEASE_AND64mr: OutMI.setOpcode(X86::AND64mr); goto ReSimplify;
	case X86::RELEASE_OR8mi: OutMI.setOpcode(X86::OR8mi); goto ReSimplify;
	case X86::RELEASE_OR8mr: OutMI.setOpcode(X86::OR8mr); goto ReSimplify;
	case X86::RELEASE_OR32mi: OutMI.setOpcode(X86::OR32mi); goto ReSimplify;
	case X86::RELEASE_OR32mr: OutMI.setOpcode(X86::OR32mr); goto ReSimplify;
	case X86::RELEASE_OR64mi32: OutMI.setOpcode(X86::OR64mi32); goto ReSimplify;
	case X86::RELEASE_OR64mr: OutMI.setOpcode(X86::OR64mr); goto ReSimplify;
	case X86::RELEASE_XOR8mi: OutMI.setOpcode(X86::XOR8mi); goto ReSimplify;
	case X86::RELEASE_XOR8mr: OutMI.setOpcode(X86::XOR8mr); goto ReSimplify;
	case X86::RELEASE_XOR32mi: OutMI.setOpcode(X86::XOR32mi); goto ReSimplify;
	case X86::RELEASE_XOR32mr: OutMI.setOpcode(X86::XOR32mr); goto ReSimplify;
	case X86::RELEASE_XOR64mi32: OutMI.setOpcode(X86::XOR64mi32); goto ReSimplify;
	case X86::RELEASE_XOR64mr: OutMI.setOpcode(X86::XOR64mr); goto ReSimplify;
	case X86::RELEASE_INC8m: OutMI.setOpcode(X86::INC8m); goto ReSimplify;
	case X86::RELEASE_INC16m: OutMI.setOpcode(X86::INC16m); goto ReSimplify;
	case X86::RELEASE_INC32m: OutMI.setOpcode(X86::INC32m); goto ReSimplify;
	case X86::RELEASE_INC64m: OutMI.setOpcode(X86::INC64m); goto ReSimplify;
	case X86::RELEASE_DEC8m: OutMI.setOpcode(X86::DEC8m); goto ReSimplify;
	case X86::RELEASE_DEC16m: OutMI.setOpcode(X86::DEC16m); goto ReSimplify;
	case X86::RELEASE_DEC32m: OutMI.setOpcode(X86::DEC32m); goto ReSimplify;
	case X86::RELEASE_DEC64m: OutMI.setOpcode(X86::DEC64m); goto ReSimplify;

	// We don't currently select the correct instruction form for instructions
	// which have a short %eax, etc. form. Handle this by custom lowering, for
	// now.
	//
	// Note, we are currently not handling the following instructions:
	// MOV64ao8, MOV64o8a
	// XCHG16ar, XCHG32ar, XCHG64ar
	case X86::MOV8mr_NOREX:
	case X86::MOV8mr:
	case X86::MOV8rm_NOREX:
	case X86::MOV8rm:
	case X86::MOV16mr:
	case X86::MOV16rm:
	case X86::MOV32mr:
	case X86::MOV32rm: {
	unsigned NewOpc;
	switch (OutMI.getOpcode()) {
	default: llvm_unreachable("Invalid opcode");
	case X86::MOV8mr_NOREX:
	case X86::MOV8mr: NewOpc = X86::MOV8o32a; break;
	case X86::MOV8rm_NOREX:
	case X86::MOV8rm: NewOpc = X86::MOV8ao32; break;
	case X86::MOV16mr: NewOpc = X86::MOV16o32a; break;
	case X86::MOV16rm: NewOpc = X86::MOV16ao32; break;
	case X86::MOV32mr: NewOpc = X86::MOV32o32a; break;
	case X86::MOV32rm: NewOpc = X86::MOV32ao32; break;
	}
	// The helper decides whether the short form is actually applicable.
	SimplifyShortMoveForm(AsmPrinter, OutMI, NewOpc);
	break;
	}

	case X86::ADC8ri: case X86::ADC16ri: case X86::ADC32ri: case X86::ADC64ri32:
	case X86::ADD8ri: case X86::ADD16ri: case X86::ADD32ri: case X86::ADD64ri32:
	case X86::AND8ri: case X86::AND16ri: case X86::AND32ri: case X86::AND64ri32:
	case X86::CMP8ri: case X86::CMP16ri: case X86::CMP32ri: case X86::CMP64ri32:
	case X86::OR8ri: case X86::OR16ri: case X86::OR32ri: case X86::OR64ri32:
	case X86::SBB8ri: case X86::SBB16ri: case X86::SBB32ri: case X86::SBB64ri32:
	case X86::SUB8ri: case X86::SUB16ri: case X86::SUB32ri: case X86::SUB64ri32:
	case X86::TEST8ri:case X86::TEST16ri:case X86::TEST32ri:case X86::TEST64ri32:
	case X86::XOR8ri: case X86::XOR16ri: case X86::XOR32ri: case X86::XOR64ri32: {
	unsigned NewOpc;
	switch (OutMI.getOpcode()) {
	default: llvm_unreachable("Invalid opcode");
	case X86::ADC8ri: NewOpc = X86::ADC8i8; break;
	case X86::ADC16ri: NewOpc = X86::ADC16i16; break;
	case X86::ADC32ri: NewOpc = X86::ADC32i32; break;
	case X86::ADC64ri32: NewOpc = X86::ADC64i32; break;
	case X86::ADD8ri: NewOpc = X86::ADD8i8; break;
	case X86::ADD16ri: NewOpc = X86::ADD16i16; break;
	case X86::ADD32ri: NewOpc = X86::ADD32i32; break;
	case X86::ADD64ri32: NewOpc = X86::ADD64i32; break;
	case X86::AND8ri: NewOpc = X86::AND8i8; break;
	case X86::AND16ri: NewOpc = X86::AND16i16; break;
	case X86::AND32ri: NewOpc = X86::AND32i32; break;
	case X86::AND64ri32: NewOpc = X86::AND64i32; break;
	case X86::CMP8ri: NewOpc = X86::CMP8i8; break;
	case X86::CMP16ri: NewOpc = X86::CMP16i16; break;
	case X86::CMP32ri: NewOpc = X86::CMP32i32; break;
	case X86::CMP64ri32: NewOpc = X86::CMP64i32; break;
	case X86::OR8ri: NewOpc = X86::OR8i8; break;
	case X86::OR16ri: NewOpc = X86::OR16i16; break;
	case X86::OR32ri: NewOpc = X86::OR32i32; break;
	case X86::OR64ri32: NewOpc = X86::OR64i32; break;
	case X86::SBB8ri: NewOpc = X86::SBB8i8; break;
	case X86::SBB16ri: NewOpc = X86::SBB16i16; break;
	case X86::SBB32ri: NewOpc = X86::SBB32i32; break;
	case X86::SBB64ri32: NewOpc = X86::SBB64i32; break;
	case X86::SUB8ri: NewOpc = X86::SUB8i8; break;
	case X86::SUB16ri: NewOpc = X86::SUB16i16; break;
	case X86::SUB32ri: NewOpc = X86::SUB32i32; break;
	case X86::SUB64ri32: NewOpc = X86::SUB64i32; break;
	case X86::TEST8ri: NewOpc = X86::TEST8i8; break;
	case X86::TEST16ri: NewOpc = X86::TEST16i16; break;
	case X86::TEST32ri: NewOpc = X86::TEST32i32; break;
	case X86::TEST64ri32: NewOpc = X86::TEST64i32; break;
	case X86::XOR8ri: NewOpc = X86::XOR8i8; break;
	case X86::XOR16ri: NewOpc = X86::XOR16i16; break;
	case X86::XOR32ri: NewOpc = X86::XOR32i32; break;
	case X86::XOR64ri32: NewOpc = X86::XOR64i32; break;
	}
	// The helper only rewrites when the register is AL/AX/EAX/RAX.
	SimplifyShortImmForm(OutMI, NewOpc);
	break;
	}

	// Try to shrink some forms of movsx.
	case X86::MOVSX16rr8:
	case X86::MOVSX32rr16:
	case X86::MOVSX64rr32:
	SimplifyMOVSX(OutMI);
	break;
	}
	}

	/// Emit the instruction sequence for the TLS pseudo instructions TLS_addr32,
	/// TLS_addr64, TLS_base_addr32 and TLS_base_addr64: an LEA that materializes
	/// the symbol reference, followed by a call to the TLS address helper. Any
	/// other opcode is unreachable. Every instruction goes through
	/// EmitAndCountInstruction.
	void X86AsmPrinter::LowerTlsAddr(X86MCInstLower &MCInstLowering,
	const MachineInstr &MI) {

	bool is64Bits = MI.getOpcode() == X86::TLS_addr64 \|\|
	MI.getOpcode() == X86::TLS_base_addr64;

	// Only TLS_addr64 gets the extra prefixes emitted around the LEA and call.
	// NOTE(review): presumably this pads the sequence to the fixed size the
	// linker expects - confirm against the TLS ABI.
	bool needsPadding = MI.getOpcode() == X86::TLS_addr64;

	MCContext &context = OutStreamer->getContext();

	if (needsPadding)
	EmitAndCountInstruction(MCInstBuilder(X86::DATA16_PREFIX));

	// Select the relocation variant for the symbol reference.
	MCSymbolRefExpr::VariantKind SRVK;
	switch (MI.getOpcode()) {
	case X86::TLS_addr32:
	case X86::TLS_addr64:
	SRVK = MCSymbolRefExpr::VK_TLSGD;
	break;
	case X86::TLS_base_addr32:
	SRVK = MCSymbolRefExpr::VK_TLSLDM;
	break;
	case X86::TLS_base_addr64:
	SRVK = MCSymbolRefExpr::VK_TLSLD;
	break;
	default:
	llvm_unreachable("unexpected opcode");
	}

	// The symbol is taken from operand 3 of the pseudo instruction.
	MCSymbol *sym = MCInstLowering.GetSymbolFromOperand(MI.getOperand(3));
	const MCSymbolRefExpr *symRef = MCSymbolRefExpr::create(sym, SRVK, context);

	// Three LEA shapes: RIP-relative into RDI (64-bit), EBX as base register
	// (32-bit TLSLDM), or EBX as index register (remaining 32-bit case).
	MCInst LEA;
	if (is64Bits) {
	LEA.setOpcode(X86::LEA64r);
	LEA.addOperand(MCOperand::createReg(X86::RDI)); // dest
	LEA.addOperand(MCOperand::createReg(X86::RIP)); // base
	LEA.addOperand(MCOperand::createImm(1)); // scale
	LEA.addOperand(MCOperand::createReg(0)); // index
	LEA.addOperand(MCOperand::createExpr(symRef)); // disp
	LEA.addOperand(MCOperand::createReg(0)); // seg
	} else if (SRVK == MCSymbolRefExpr::VK_TLSLDM) {
	LEA.setOpcode(X86::LEA32r);
	LEA.addOperand(MCOperand::createReg(X86::EAX)); // dest
	LEA.addOperand(MCOperand::createReg(X86::EBX)); // base
	LEA.addOperand(MCOperand::createImm(1)); // scale
	LEA.addOperand(MCOperand::createReg(0)); // index
	LEA.addOperand(MCOperand::createExpr(symRef)); // disp
	LEA.addOperand(MCOperand::createReg(0)); // seg
	} else {
	LEA.setOpcode(X86::LEA32r);
	LEA.addOperand(MCOperand::createReg(X86::EAX)); // dest
	LEA.addOperand(MCOperand::createReg(0)); // base
	LEA.addOperand(MCOperand::createImm(1)); // scale
	LEA.addOperand(MCOperand::createReg(X86::EBX)); // index
	LEA.addOperand(MCOperand::createExpr(symRef)); // disp
	LEA.addOperand(MCOperand::createReg(0)); // seg
	}
	EmitAndCountInstruction(LEA);

	// Prefixes emitted between the LEA and the call: data16, data16, rex64.
	if (needsPadding) {
	EmitAndCountInstruction(MCInstBuilder(X86::DATA16_PREFIX));
	EmitAndCountInstruction(MCInstBuilder(X86::DATA16_PREFIX));
	EmitAndCountInstruction(MCInstBuilder(X86::REX64_PREFIX));
	}

	// The helper name has two leading underscores in the 64-bit case and three
	// in the 32-bit case; it is referenced through the PLT variant.
	StringRef name = is64Bits ? "__tls_get_addr" : "___tls_get_addr";
	MCSymbol *tlsGetAddr = context.getOrCreateSymbol(name);
	const MCSymbolRefExpr *tlsRef =
	MCSymbolRefExpr::create(tlsGetAddr,
	MCSymbolRefExpr::VK_PLT,
	context);

	EmitAndCountInstruction(MCInstBuilder(is64Bits ? X86::CALL64pcrel32
	: X86::CALLpcrel32)
	.addExpr(tlsRef));
	}

	/// \brief Emit one nop whose encoding is as long as possible without
	/// exceeding \p NumBytes bytes. Returns the number of bytes emitted.
	static unsigned EmitNop(MCStreamer &OS, unsigned NumBytes, bool Is64Bit,
	                        const MCSubtargetInfo &STI) {
	  // Only 64-bit is handled here. For 32-bit we would additionally have to
	  // check whether the CPU supports multi-byte nops.
	  assert(Is64Bit && "EmitNops only supports X86-64");

	  // Opcode plus the memory-operand fields used by the NOOPL/NOOPW forms.
	  unsigned Opcode = 0;
	  unsigned Base = X86::RAX;
	  unsigned Scale = 1;
	  unsigned Index = 0;
	  unsigned Disp = 0;
	  unsigned Segment = 0;
	  unsigned Emitted;

	  // Select the base encoding for the requested length (capped at 10 bytes).
	  switch (NumBytes) {
	  case 0:
	    llvm_unreachable("Zero nops?");
	    break;
	  case 1:
	    Emitted = 1;
	    Opcode = X86::NOOP;
	    break;
	  case 2:
	    Emitted = 2;
	    Opcode = X86::XCHG16ar;
	    break;
	  case 3:
	    Emitted = 3;
	    Opcode = X86::NOOPL;
	    break;
	  case 4:
	    Emitted = 4;
	    Opcode = X86::NOOPL;
	    Disp = 8;
	    break;
	  case 5:
	    Emitted = 5;
	    Opcode = X86::NOOPL;
	    Disp = 8;
	    Index = X86::RAX;
	    break;
	  case 6:
	    Emitted = 6;
	    Opcode = X86::NOOPW;
	    Disp = 8;
	    Index = X86::RAX;
	    break;
	  case 7:
	    Emitted = 7;
	    Opcode = X86::NOOPL;
	    Disp = 512;
	    break;
	  case 8:
	    Emitted = 8;
	    Opcode = X86::NOOPL;
	    Disp = 512;
	    Index = X86::RAX;
	    break;
	  case 9:
	    Emitted = 9;
	    Opcode = X86::NOOPW;
	    Disp = 512;
	    Index = X86::RAX;
	    break;
	  default:
	    Emitted = 10;
	    Opcode = X86::NOOPW;
	    Disp = 512;
	    Index = X86::RAX;
	    Segment = X86::CS;
	    break;
	  }

	  // Any bytes still owed (at most five) are covered by 0x66 prefix bytes
	  // emitted in front of the instruction.
	  const unsigned PrefixCount = std::min(NumBytes - Emitted, 5U);
	  Emitted += PrefixCount;
	  for (unsigned Done = 0; Done != PrefixCount; ++Done)
	    OS.EmitBytes("\x66");

	  if (Opcode == X86::NOOP) {
	    OS.EmitInstruction(MCInstBuilder(Opcode), STI);
	  } else if (Opcode == X86::XCHG16ar) {
	    OS.EmitInstruction(MCInstBuilder(Opcode).addReg(X86::AX), STI);
	  } else if (Opcode == X86::NOOPL || Opcode == X86::NOOPW) {
	    OS.EmitInstruction(MCInstBuilder(Opcode)
	                           .addReg(Base)
	                           .addImm(Scale)
	                           .addReg(Index)
	                           .addImm(Disp)
	                           .addReg(Segment),
	                       STI);
	  } else {
	    llvm_unreachable("Unexpected opcode");
	  }

	  assert(Emitted <= NumBytes && "We overemitted?");
	  return Emitted;
	}

	/// \brief Fill \p NumBytes bytes with nops by calling EmitNop repeatedly
	/// until nothing is left to emit.
	static void EmitNops(MCStreamer &OS, unsigned NumBytes, bool Is64Bit,
	                     const MCSubtargetInfo &STI) {
	  for (unsigned Remaining = NumBytes; Remaining != 0;) {
	    const unsigned Emitted = EmitNop(OS, Remaining, Is64Bit, STI);
	    Remaining -= Emitted;
	    // If EmitNop returned more than was left, the unsigned counter wrapped.
	    assert(NumBytes >= Remaining && "Emitted more than I asked for!");
	  }
	}

	void X86AsmPrinter::LowerSTATEPOINT(const MachineInstr &MI,
	X86MCInstLower &MCIL) {
	assert(Subtarget->is64Bit() && "Statepoint currently only supports X86-64");

	StatepointOpers SOpers(&MI);
	if (unsigned PatchBytes = SOpers.getNumPatchBytes()) {
	EmitNops(*OutStreamer, PatchBytes, Subtarget->is64Bit(),
	getSubtargetInfo());
	} else {
	// Lower call target and choose correct opcode
	const MachineOperand &CallTarget = SOpers.getCallTarget();
	MCOperand CallTargetMCOp;
	unsigned CallOpcode;
	switch (CallTarget.getType()) {
	case MachineOperand::MO_GlobalAddress:
	case MachineOperand::MO_ExternalSymbol:
	CallTargetMCOp = MCIL.LowerSymbolOperand(
	CallTarget, MCIL.GetSymbolFromOperand(CallTarget));
	CallOpcode = X86::CALL64pcrel32;
	// Currently, we only support relative addressing with statepoints.
	// Otherwise, we'll need a scratch register to hold the target
	// address. You'll fail asserts during load & relocation if this
	// symbol is to far away. (TODO: support non-relative addressing)
	break;
	case MachineOperand::MO_Immediate:
	CallTargetMCOp = MCOperand::createImm(CallTarget.getImm());
	CallOpcode = X86::CALL64pcrel32;
	// Currently, we only support relative addressing with statepoints.
	// Otherwise, we'll need a scratch register to hold the target
	// immediate. You'll fail asserts during load & relocation if this
	// address is to far away. (TODO: support non-relative addressing)
	break;
	case MachineOperand::MO_Register:
	CallTargetMCOp = MCOperand::createReg(CallTarget.getReg());
	CallOpcode = X86::CALL64r;
	break;
	default:
	llvm_unreachable("Unsupported operand type in statepoint call target");
	break;
	}

	// Emit call
	MCInst CallInst;
	CallInst.setOpcode(CallOpcode);
	CallInst.addOperand(CallTargetMCOp);
	OutStreamer->EmitInstruction(CallInst, getSubtargetInfo());
	}

	// Record our statepoint node in the same section used by STACKMAP
	// and PATCHPOINT
	SM.recordStatepoint(MI);
	}

	void X86AsmPrinter::LowerFAULTING_LOAD_OP(const MachineInstr &MI,
	X86MCInstLower &MCIL) {
	// FAULTING_LOAD_OP <def>, <MBB handler>, <load opcode>, <load operands>

	unsigned LoadDefRegister = MI.getOperand(0).getReg();
	MCSymbol *HandlerLabel = MI.getOperand(1).getMBB()->getSymbol();
	unsigned LoadOpcode = MI.getOperand(2).getImm();
	unsigned LoadOperandsBeginIdx = 3;

	FM.recordFaultingOp(FaultMaps::FaultingLoad, HandlerLabel);

	MCInst LoadMI;
	LoadMI.setOpcode(LoadOpcode);

	if (LoadDefRegister != X86::NoRegister)
	LoadMI.addOperand(MCOperand::createReg(LoadDefRegister));

	for (auto I = MI.operands_begin() + LoadOperandsBeginIdx,
	E = MI.operands_end();
	I != E; ++I)
	if (auto MaybeOperand = MCIL.LowerMachineOperand(&MI, *I))
	LoadMI.addOperand(MaybeOperand.getValue());

	OutStreamer->EmitInstruction(LoadMI, getSubtargetInfo());
	}

	void X86AsmPrinter::LowerPATCHABLE_OP(const MachineInstr &MI,
	X86MCInstLower &MCIL) {
	// PATCHABLE_OP minsize, opcode, operands

	unsigned MinSize = MI.getOperand(0).getImm();
	unsigned Opcode = MI.getOperand(1).getImm();

	MCInst MCI;
	MCI.setOpcode(Opcode);
	for (auto &MO : make_range(MI.operands_begin() + 2, MI.operands_end()))
	if (auto MaybeOperand = MCIL.LowerMachineOperand(&MI, MO))
	MCI.addOperand(MaybeOperand.getValue());

	SmallString<256> Code;
	SmallVector<MCFixup, 4> Fixups;
	raw_svector_ostream VecOS(Code);
	CodeEmitter->encodeInstruction(MCI, VecOS, Fixups, getSubtargetInfo());

	if (Code.size() < MinSize) {
	if (MinSize == 2 && Opcode == X86::PUSH64r) {
	// This is an optimization that lets us get away without emitting a nop in
	// many cases.
	//
	// NB! In some cases the encoding for PUSH64r (e.g. PUSH64r %R9) takes two
	// bytes too, so the check on MinSize is important.
	MCI.setOpcode(X86::PUSH64rmr);
	} else {
	unsigned NopSize = EmitNop(*OutStreamer, MinSize, Subtarget->is64Bit(),
	getSubtargetInfo());
	assert(NopSize == MinSize && "Could not implement MinSize!");
	(void) NopSize;
	}
	}

	OutStreamer->EmitInstruction(MCI, getSubtargetInfo());
	}

	// Lower a stackmap. Operand layout:
	//   <id>, <shadowBytes>, ...
	void X86AsmPrinter::LowerSTACKMAP(const MachineInstr &MI) {
	  // Finish padding any shadow that is still open before recording this
	  // stackmap.
	  SMShadowTracker.emitShadowPadding(*OutStreamer, getSubtargetInfo());
	  SM.recordStackMap(MI);

	  // Restart shadow tracking with the size requested by operand 1.
	  unsigned ShadowSize = MI.getOperand(1).getImm();
	  SMShadowTracker.reset(ShadowSize);
	}

	// Lower a patchpoint. Operand layout:
	//   [<def>], <id>, <numBytes>, <target>, <numArgs>, <cc>, ...
	void X86AsmPrinter::LowerPATCHPOINT(const MachineInstr &MI,
	                                    X86MCInstLower &MCIL) {
	  assert(Subtarget->is64Bit() && "Patchpoint currently only supports X86-64");

	  SMShadowTracker.emitShadowPadding(*OutStreamer, getSubtargetInfo());

	  SM.recordPatchPoint(MI);

	  PatchPointOpers opers(&MI);
	  unsigned ScratchIdx = opers.getNextScratchIdx();
	  unsigned EncodedBytes = 0;
	  const MachineOperand &CalleeMO = opers.getCallTarget();

	  // A null target is an immediate equal to zero. Any other target (a
	  // non-zero immediate or a symbol) gets a call emitted for it.
	  const bool IsNullTarget = CalleeMO.isImm() && CalleeMO.getImm() == 0;
	  if (!IsNullTarget) {
	    MCOperand CalleeMCOp;
	    switch (CalleeMO.getType()) {
	    case MachineOperand::MO_Immediate:
	      if (CalleeMO.getImm())
	        CalleeMCOp = MCOperand::createImm(CalleeMO.getImm());
	      break;
	    case MachineOperand::MO_ExternalSymbol:
	    case MachineOperand::MO_GlobalAddress:
	      CalleeMCOp = MCIL.LowerSymbolOperand(
	          CalleeMO, MCIL.GetSymbolFromOperand(CalleeMO));
	      break;
	    default:
	      /// FIXME: Add a verifier check for bad callee types.
	      llvm_unreachable("Unrecognized callee operand type.");
	    }

	    // Materialize the target address with a MOV into the scratch register
	    // and CALL through it. The pair is counted as 12 bytes, or 13 when the
	    // scratch register is an x86-64 extended register.
	    unsigned ScratchReg = MI.getOperand(ScratchIdx).getReg();
	    EncodedBytes = X86II::isX86_64ExtendedReg(ScratchReg) ? 13 : 12;

	    EmitAndCountInstruction(
	        MCInstBuilder(X86::MOV64ri).addReg(ScratchReg).addOperand(CalleeMCOp));
	    EmitAndCountInstruction(MCInstBuilder(X86::CALL64r).addReg(ScratchReg));
	  }

	  // Pad the rest of the requested patch area with nops.
	  unsigned NumBytes = opers.getNumPatchBytes();
	  assert(NumBytes >= EncodedBytes &&
	         "Patchpoint can't request size less than the length of a call.");

	  EmitNops(*OutStreamer, NumBytes - EncodedBytes, Subtarget->is64Bit(),
	           getSubtargetInfo());
	}

	void X86AsmPrinter::LowerPATCHABLE_FUNCTION_ENTER(const MachineInstr &MI,
	X86MCInstLower &MCIL) {
	// We want to emit the following pattern:
	//
	// .p2align 1, ...
	// .Lxray_sled_N:
	// jmp .tmpN
	// # 9 bytes worth of noops
	// .tmpN
	//
	// We need the 9 bytes because at runtime, we'd be patching over the full 11
	// bytes with the following pattern:
	//
	// mov %r10, <function id, 32-bit> // 6 bytes
	// call <relative offset, 32-bits> // 5 bytes
	//
	auto CurSled = OutContext.createTempSymbol("xray_sled_", true);
	OutStreamer->EmitCodeAlignment(2);
	OutStreamer->EmitLabel(CurSled);
	auto Target = OutContext.createTempSymbol();

	// Use a two-byte `jmp`. This version of JMP takes an 8-bit relative offset as
	// an operand (computed as an offset from the jmp instruction).
	// FIXME: Find another less hacky way do force the relative jump.
	OutStreamer->EmitBytes("\xeb\x09");
	EmitNops(*OutStreamer, 9, Subtarget->is64Bit(), getSubtargetInfo());
	OutStreamer->EmitLabel(Target);
	recordSled(CurSled, MI, SledKind::FUNCTION_ENTER);
	}

	void X86AsmPrinter::LowerPATCHABLE_RET(const MachineInstr &MI,
	X86MCInstLower &MCIL) {
	// Since PATCHABLE_RET takes the opcode of the return statement as an
	// argument, we use that to emit the correct form of the RET that we want.
	// i.e. when we see this:
	//
	// PATCHABLE_RET X86::RET ...
	//
	// We should emit the RET followed by sleds.
	//
	// .p2align 1, ...
	// .Lxray_sled_N:
	// ret # or equivalent instruction
	// # 10 bytes worth of noops
	//
	// This just makes sure that the alignment for the next instruction is 2.
	auto CurSled = OutContext.createTempSymbol("xray_sled_", true);
	OutStreamer->EmitCodeAlignment(2);
	OutStreamer->EmitLabel(CurSled);
	unsigned OpCode = MI.getOperand(0).getImm();
	MCInst Ret;
	Ret.setOpcode(OpCode);
	for (auto &MO : make_range(MI.operands_begin() + 1, MI.operands_end()))
	if (auto MaybeOperand = MCIL.LowerMachineOperand(&MI, MO))
	Ret.addOperand(MaybeOperand.getValue());
	OutStreamer->EmitInstruction(Ret, getSubtargetInfo());
	EmitNops(*OutStreamer, 10, Subtarget->is64Bit(), getSubtargetInfo());
	recordSled(CurSled, MI, SledKind::FUNCTION_EXIT);
	}

	void X86AsmPrinter::LowerPATCHABLE_TAIL_CALL(const MachineInstr &MI, X86MCInstLower &MCIL) {
	// Like PATCHABLE_RET, we have the actual instruction in the operands to this
	// instruction so we lower that particular instruction and its operands.
	// Unlike PATCHABLE_RET though, we put the sled before the JMP, much like how
	// we do it for PATCHABLE_FUNCTION_ENTER. The sled should be very similar to
	// the PATCHABLE_FUNCTION_ENTER case, followed by the lowering of the actual
	// tail call much like how we have it in PATCHABLE_RET.
	auto CurSled = OutContext.createTempSymbol("xray_sled_", true);
	OutStreamer->EmitCodeAlignment(2);
	OutStreamer->EmitLabel(CurSled);
	auto Target = OutContext.createTempSymbol();

	// Use a two-byte `jmp`. This version of JMP takes an 8-bit relative offset as
	// an operand (computed as an offset from the jmp instruction).
	// FIXME: Find another less hacky way do force the relative jump.
	OutStreamer->EmitBytes("\xeb\x09");
	EmitNops(*OutStreamer, 9, Subtarget->is64Bit(), getSubtargetInfo());
	OutStreamer->EmitLabel(Target);
	recordSled(CurSled, MI, SledKind::TAIL_CALL);

	unsigned OpCode = MI.getOperand(0).getImm();
	MCInst TC;
	TC.setOpcode(OpCode);

	// Before emitting the instruction, add a comment to indicate that this is
	// indeed a tail call.
	OutStreamer->AddComment("TAILCALL");
	for (auto &MO : make_range(MI.operands_begin() + 1, MI.operands_end()))
	if (auto MaybeOperand = MCIL.LowerMachineOperand(&MI, MO))
	TC.addOperand(MaybeOperand.getValue());
	OutStreamer->EmitInstruction(TC, getSubtargetInfo());
	}

	// Returns the instruction that precedes MBBI in the MachineFunction,
	// stepping back into earlier basic blocks when necessary. When there is
	// nothing before MBBI (it is the first instruction of the first basic
	// block) a default-constructed (null) iterator is returned.
	static MachineBasicBlock::const_iterator
	PrevCrossBBInst(MachineBasicBlock::const_iterator MBBI) {
	  const MachineBasicBlock *Block = MBBI->getParent();
	  for (;;) {
	    // Not at the start of the current block: simply step back one.
	    if (MBBI != Block->begin()) {
	      --MBBI;
	      return MBBI;
	    }

	    // At the start of the function's first block: no predecessor exists.
	    if (Block == &Block->getParent()->front())
	      return MachineBasicBlock::const_iterator();

	    // Continue from the end of the previous block.
	    Block = Block->getPrevNode();
	    MBBI = Block->end();
	  }
	}

	/// Return the IR constant that the constant-pool operand \p Op of \p MI
	/// refers to.
	///
	/// Returns nullptr when \p Op is not a constant-pool index, or when the
	/// pool entry is a machine constant pool entry.
	static const Constant *getConstantFromPool(const MachineInstr &MI,
	                                           const MachineOperand &Op) {
	  if (!Op.isCPI())
	    return nullptr;

	  // Look the entry up in the constant pool of the function that owns MI.
	  ArrayRef<MachineConstantPoolEntry> Constants =
	      MI.getParent()->getParent()->getConstantPool()->getConstants();
	  const MachineConstantPoolEntry &ConstantEntry =
	      Constants[Op.getIndex()];

	  // Bail if this is a machine constant pool entry, we won't be able to dig out
	  // anything useful.
	  if (ConstantEntry.isMachineConstantPoolEntry())
	    return nullptr;

	  auto *C = dyn_cast<Constant>(ConstantEntry.Val.ConstVal);
	  assert((!C || ConstantEntry.getType() == C->getType()) &&
	         "Expected a constant of the same type!");
	  return C;
	}

	/// Build the verbose-asm comment text describing a shuffle.
	///
	/// \param MI        The shuffle instruction; operand 0 is the destination.
	/// \param SrcOp1Idx Operand index of the first source. A value of 2 or 3
	///                  means the operand just before it is treated as an
	///                  AVX512 write mask.
	/// \param SrcOp2Idx Operand index of the second source (equal to
	///                  \p SrcOp1Idx for single-source shuffles).
	/// \param Mask      Decoded shuffle mask; may contain SM_SentinelZero and
	///                  SM_SentinelUndef entries.
	/// \returns Text of the form "dst {%k} {z} = src[i,j,...],zero,...".
	static std::string getShuffleComment(const MachineInstr *MI,
	                                     unsigned SrcOp1Idx,
	                                     unsigned SrcOp2Idx,
	                                     ArrayRef<int> Mask) {
	  std::string Comment;

	  // Compute the name for a register. This is really goofy because we have
	  // multiple instruction printers that could (in theory) use different
	  // names. Fortunately most people use the ATT style (outside of Windows)
	  // and they actually agree on register naming here. Ultimately, this is
	  // a comment, and so its OK if it isn't perfect.
	  auto GetRegisterName = [](unsigned RegNum) -> StringRef {
	    return X86ATTInstPrinter::getRegisterName(RegNum);
	  };

	  const MachineOperand &DstOp = MI->getOperand(0);
	  const MachineOperand &SrcOp1 = MI->getOperand(SrcOp1Idx);
	  const MachineOperand &SrcOp2 = MI->getOperand(SrcOp2Idx);

	  // Non-register operands are printed as "mem".
	  StringRef DstName = DstOp.isReg() ? GetRegisterName(DstOp.getReg()) : "mem";
	  StringRef Src1Name =
	      SrcOp1.isReg() ? GetRegisterName(SrcOp1.getReg()) : "mem";
	  StringRef Src2Name =
	      SrcOp2.isReg() ? GetRegisterName(SrcOp2.getReg()) : "mem";

	  // One source operand, fix the mask to print all elements in one span.
	  SmallVector<int, 8> ShuffleMask(Mask.begin(), Mask.end());
	  if (Src1Name == Src2Name)
	    for (int i = 0, e = ShuffleMask.size(); i != e; ++i)
	      if (ShuffleMask[i] >= e)
	        ShuffleMask[i] -= e;

	  raw_string_ostream CS(Comment);
	  CS << DstName;

	  // Handle AVX512 MASK/MASKZ write mask comments.
	  // MASK: zmmX {%kY}
	  // MASKZ: zmmX {%kY} {z}
	  if (SrcOp1Idx > 1) {
	    assert((SrcOp1Idx == 2 || SrcOp1Idx == 3) && "Unexpected writemask");

	    const MachineOperand &WriteMaskOp = MI->getOperand(SrcOp1Idx - 1);
	    if (WriteMaskOp.isReg()) {
	      CS << " {%" << GetRegisterName(WriteMaskOp.getReg()) << "}";

	      // The source at index 2 is the case printed with the {z} marker.
	      if (SrcOp1Idx == 2) {
	        CS << " {z}";
	      }
	    }
	  }

	  CS << " = ";

	  for (int i = 0, e = ShuffleMask.size(); i != e; ++i) {
	    if (i != 0)
	      CS << ",";
	    if (ShuffleMask[i] == SM_SentinelZero) {
	      CS << "zero";
	      continue;
	    }

	    // Otherwise, it must come from src1 or src2. Print the span of elements
	    // that comes from this src.
	    bool isSrc1 = ShuffleMask[i] < (int)e;
	    CS << (isSrc1 ? Src1Name : Src2Name) << '[';

	    // Consume consecutive mask entries that refer to the same source.
	    bool IsFirst = true;
	    while (i != e && ShuffleMask[i] != SM_SentinelZero &&
	           (ShuffleMask[i] < (int)e) == isSrc1) {
	      if (!IsFirst)
	        CS << ',';
	      else
	        IsFirst = false;
	      if (ShuffleMask[i] == SM_SentinelUndef)
	        CS << "u";
	      else
	        CS << ShuffleMask[i] % (int)e;
	      ++i;
	    }
	    CS << ']';
	    --i; // For loop increments element #.
	  }
	  CS.flush();

	  return Comment;
	}

	// Lower a single MachineInstr and emit it through OutStreamer.
	//
	// The switch below deals with opcodes that need special treatment. Its cases
	// either emit their own sequence and return, or only attach a comment and
	// break out so that the generic lowering after the switch emits the
	// instruction.
	void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) {
	X86MCInstLower MCInstLowering(MF, this);
	const X86RegisterInfo *RI = MF->getSubtarget<X86Subtarget>().getRegisterInfo();

	// Add a comment about EVEX-2-VEX compression for AVX-512 instrs that
	// are compressed from EVEX encoding to VEX encoding.
	if (TM.Options.MCOptions.ShowMCEncoding) {
	if (MI->getAsmPrinterFlags() & AC_EVEX_2_VEX)
	OutStreamer->AddComment("EVEX TO VEX Compression ", false);
	}

	switch (MI->getOpcode()) {
	case TargetOpcode::DBG_VALUE:
	llvm_unreachable("Should be handled target independently");

	// Emit nothing here but a comment if we can.
	case X86::Int_MemBarrier:
	OutStreamer->emitRawComment("MEMBARRIER");
	return;


	case X86::EH_RETURN:
	case X86::EH_RETURN64: {
	// Lower these as normal, but add some comments.
	unsigned Reg = MI->getOperand(0).getReg();
	OutStreamer->AddComment(StringRef("eh_return, addr: %") +
	X86ATTInstPrinter::getRegisterName(Reg));
	break;
	}
	case X86::CLEANUPRET: {
	// Lower these as normal, but add some comments.
	OutStreamer->AddComment("CLEANUPRET");
	break;
	}

	case X86::CATCHRET: {
	// Lower these as normal, but add some comments.
	OutStreamer->AddComment("CATCHRET");
	break;
	}

	case X86::TAILJMPr:
	case X86::TAILJMPm:
	case X86::TAILJMPd:
	- case X86::TAILJMPd_CC:
	case X86::TAILJMPr64:
	case X86::TAILJMPm64:
	case X86::TAILJMPd64:
	- case X86::TAILJMPd64_CC:
	case X86::TAILJMPr64_REX:
	case X86::TAILJMPm64_REX:
	// Lower these as normal, but add some comments.
	OutStreamer->AddComment("TAILCALL");
	break;

	case X86::TLS_addr32:
	case X86::TLS_addr64:
	case X86::TLS_base_addr32:
	case X86::TLS_base_addr64:
	return LowerTlsAddr(MCInstLowering, *MI);

	case X86::MOVPC32r: {
	// This is a pseudo op for a two instruction sequence with a label, which
	// looks like:
	// call "L1$pb"
	// "L1$pb":
	// popl %esi

	// Emit the call.
	MCSymbol *PICBase = MF->getPICBaseSymbol();
	// FIXME: We would like an efficient form for this, so we don't have to do a
	// lot of extra uniquing.
	EmitAndCountInstruction(MCInstBuilder(X86::CALLpcrel32)
	.addExpr(MCSymbolRefExpr::create(PICBase, OutContext)));

	const X86FrameLowering* FrameLowering =
	MF->getSubtarget<X86Subtarget>().getFrameLowering();
	bool hasFP = FrameLowering->hasFP(*MF);

	// TODO: This is needed only if we require precise CFA.
	bool HasActiveDwarfFrame = OutStreamer->getNumFrameInfos() &&
	!OutStreamer->getDwarfFrameInfos().back().End;

	int stackGrowth = -RI->getSlotSize();

	// Without a frame pointer, adjust the CFA offset by one slot around the
	// call/pop pair when a DWARF frame is active.
	if (HasActiveDwarfFrame && !hasFP) {
	OutStreamer->EmitCFIAdjustCfaOffset(-stackGrowth);
	}

	// Emit the label.
	OutStreamer->EmitLabel(PICBase);

	// popl $reg
	EmitAndCountInstruction(MCInstBuilder(X86::POP32r)
	.addReg(MI->getOperand(0).getReg()));

	if (HasActiveDwarfFrame && !hasFP) {
	OutStreamer->EmitCFIAdjustCfaOffset(stackGrowth);
	}
	return;
	}

	case X86::ADD32ri: {
	// Lower the MO_GOT_ABSOLUTE_ADDRESS form of ADD32ri.
	if (MI->getOperand(2).getTargetFlags() != X86II::MO_GOT_ABSOLUTE_ADDRESS)
	break;

	// Okay, we have something like:
	// EAX = ADD32ri EAX, MO_GOT_ABSOLUTE_ADDRESS(@MYGLOBAL)

	// For this, we want to print something like:
	// MYGLOBAL + (. - PICBASE)
	// However, we can't generate a ".", so just emit a new label here and refer
	// to it.
	MCSymbol *DotSym = OutContext.createTempSymbol();
	OutStreamer->EmitLabel(DotSym);

	// Now that we have emitted the label, lower the complex operand expression.
	MCSymbol *OpSym = MCInstLowering.GetSymbolFromOperand(MI->getOperand(2));

	const MCExpr *DotExpr = MCSymbolRefExpr::create(DotSym, OutContext);
	const MCExpr *PICBase =
	MCSymbolRefExpr::create(MF->getPICBaseSymbol(), OutContext);
	DotExpr = MCBinaryExpr::createSub(DotExpr, PICBase, OutContext);

	DotExpr = MCBinaryExpr::createAdd(MCSymbolRefExpr::create(OpSym,OutContext),
	DotExpr, OutContext);

	EmitAndCountInstruction(MCInstBuilder(X86::ADD32ri)
	.addReg(MI->getOperand(0).getReg())
	.addReg(MI->getOperand(1).getReg())
	.addExpr(DotExpr));
	return;
	}
	case TargetOpcode::STATEPOINT:
	return LowerSTATEPOINT(*MI, MCInstLowering);

	case TargetOpcode::FAULTING_LOAD_OP:
	return LowerFAULTING_LOAD_OP(*MI, MCInstLowering);

	case TargetOpcode::PATCHABLE_OP:
	return LowerPATCHABLE_OP(*MI, MCInstLowering);

	case TargetOpcode::STACKMAP:
	return LowerSTACKMAP(*MI);

	case TargetOpcode::PATCHPOINT:
	return LowerPATCHPOINT(*MI, MCInstLowering);

	case TargetOpcode::PATCHABLE_FUNCTION_ENTER:
	return LowerPATCHABLE_FUNCTION_ENTER(*MI, MCInstLowering);

	case TargetOpcode::PATCHABLE_RET:
	return LowerPATCHABLE_RET(*MI, MCInstLowering);

	case TargetOpcode::PATCHABLE_TAIL_CALL:
	return LowerPATCHABLE_TAIL_CALL(*MI, MCInstLowering);

	case X86::MORESTACK_RET:
	EmitAndCountInstruction(MCInstBuilder(getRetOpcode(*Subtarget)));
	return;

	case X86::MORESTACK_RET_RESTORE_R10:
	// Return, then restore R10.
	EmitAndCountInstruction(MCInstBuilder(getRetOpcode(*Subtarget)));
	EmitAndCountInstruction(MCInstBuilder(X86::MOV64rr)
	.addReg(X86::R10)
	.addReg(X86::RAX));
	return;

	// The SEH_* pseudos below emit no machine code; each one is forwarded to
	// the matching EmitWinCFI* call on the streamer.
	case X86::SEH_PushReg:
	assert(MF->hasWinCFI() && "SEH_ instruction in function without WinCFI?");
	OutStreamer->EmitWinCFIPushReg(RI->getSEHRegNum(MI->getOperand(0).getImm()));
	return;

	case X86::SEH_SaveReg:
	assert(MF->hasWinCFI() && "SEH_ instruction in function without WinCFI?");
	OutStreamer->EmitWinCFISaveReg(RI->getSEHRegNum(MI->getOperand(0).getImm()),
	MI->getOperand(1).getImm());
	return;

	case X86::SEH_SaveXMM:
	assert(MF->hasWinCFI() && "SEH_ instruction in function without WinCFI?");
	OutStreamer->EmitWinCFISaveXMM(RI->getSEHRegNum(MI->getOperand(0).getImm()),
	MI->getOperand(1).getImm());
	return;

	case X86::SEH_StackAlloc:
	assert(MF->hasWinCFI() && "SEH_ instruction in function without WinCFI?");
	OutStreamer->EmitWinCFIAllocStack(MI->getOperand(0).getImm());
	return;

	case X86::SEH_SetFrame:
	assert(MF->hasWinCFI() && "SEH_ instruction in function without WinCFI?");
	OutStreamer->EmitWinCFISetFrame(RI->getSEHRegNum(MI->getOperand(0).getImm()),
	MI->getOperand(1).getImm());
	return;

	case X86::SEH_PushFrame:
	assert(MF->hasWinCFI() && "SEH_ instruction in function without WinCFI?");
	OutStreamer->EmitWinCFIPushFrame(MI->getOperand(0).getImm());
	return;

	case X86::SEH_EndPrologue:
	assert(MF->hasWinCFI() && "SEH_ instruction in function without WinCFI?");
	OutStreamer->EmitWinCFIEndProlog();
	return;

	case X86::SEH_Epilogue: {
	assert(MF->hasWinCFI() && "SEH_ instruction in function without WinCFI?");
	MachineBasicBlock::const_iterator MBBI(MI);
	// Check if preceded by a call and emit nop if so.
	for (MBBI = PrevCrossBBInst(MBBI);
	MBBI != MachineBasicBlock::const_iterator();
	MBBI = PrevCrossBBInst(MBBI)) {
	// Conservatively assume that pseudo instructions don't emit code and keep
	// looking for a call. We may emit an unnecessary nop in some cases.
	if (!MBBI->isPseudo()) {
	if (MBBI->isCall())
	EmitAndCountInstruction(MCInstBuilder(X86::NOOP));
	break;
	}
	}
	return;
	}

	// Lower PSHUFB and VPERMILP normally but add a comment if we can find
	// a constant shuffle mask. We won't be able to do this at the MC layer
	// because the mask isn't an immediate.
	case X86::PSHUFBrm:
	case X86::VPSHUFBrm:
	case X86::VPSHUFBYrm:
	case X86::VPSHUFBZ128rm:
	case X86::VPSHUFBZ128rmk:
	case X86::VPSHUFBZ128rmkz:
	case X86::VPSHUFBZ256rm:
	case X86::VPSHUFBZ256rmk:
	case X86::VPSHUFBZ256rmkz:
	case X86::VPSHUFBZrm:
	case X86::VPSHUFBZrmk:
	case X86::VPSHUFBZrmkz: {
	if (!OutStreamer->isVerboseAsm())
	break;
	// SrcIdx and MaskIdx are the operand positions of the shuffled source and
	// of the mask; they shift by one for the rmkz forms and by two for the rmk
	// forms.
	unsigned SrcIdx, MaskIdx;
	switch (MI->getOpcode()) {
	default: llvm_unreachable("Invalid opcode");
	case X86::PSHUFBrm:
	case X86::VPSHUFBrm:
	case X86::VPSHUFBYrm:
	case X86::VPSHUFBZ128rm:
	case X86::VPSHUFBZ256rm:
	case X86::VPSHUFBZrm:
	SrcIdx = 1; MaskIdx = 5; break;
	case X86::VPSHUFBZ128rmkz:
	case X86::VPSHUFBZ256rmkz:
	case X86::VPSHUFBZrmkz:
	SrcIdx = 2; MaskIdx = 6; break;
	case X86::VPSHUFBZ128rmk:
	case X86::VPSHUFBZ256rmk:
	case X86::VPSHUFBZrmk:
	SrcIdx = 3; MaskIdx = 7; break;
	}

	assert(MI->getNumOperands() >= 6 &&
	"We should always have at least 6 operands!");

	const MachineOperand &MaskOp = MI->getOperand(MaskIdx);
	// NOTE(review): getConstantFromPool is declared taking `const MachineInstr &`
	// while MI here is a pointer; a dereference may have been lost when this
	// text was extracted - confirm against the upstream file.
	if (auto C = getConstantFromPool(MI, MaskOp)) {
	SmallVector<int, 64> Mask;
	DecodePSHUFBMask(C, Mask);
	if (!Mask.empty())
	OutStreamer->AddComment(getShuffleComment(MI, SrcIdx, SrcIdx, Mask));
	}
	break;
	}

	case X86::VPERMILPSrm:
	case X86::VPERMILPSYrm:
	case X86::VPERMILPSZ128rm:
	case X86::VPERMILPSZ128rmk:
	case X86::VPERMILPSZ128rmkz:
	case X86::VPERMILPSZ256rm:
	case X86::VPERMILPSZ256rmk:
	case X86::VPERMILPSZ256rmkz:
	case X86::VPERMILPSZrm:
	case X86::VPERMILPSZrmk:
	case X86::VPERMILPSZrmkz:
	case X86::VPERMILPDrm:
	case X86::VPERMILPDYrm:
	case X86::VPERMILPDZ128rm:
	case X86::VPERMILPDZ128rmk:
	case X86::VPERMILPDZ128rmkz:
	case X86::VPERMILPDZ256rm:
	case X86::VPERMILPDZ256rmk:
	case X86::VPERMILPDZ256rmkz:
	case X86::VPERMILPDZrm:
	case X86::VPERMILPDZrmk:
	case X86::VPERMILPDZrmkz: {
	if (!OutStreamer->isVerboseAsm())
	break;
	unsigned SrcIdx, MaskIdx;
	// Element width in bits passed to the mask decoder: 32 for the PS forms,
	// 64 for the PD forms.
	unsigned ElSize;
	switch (MI->getOpcode()) {
	default: llvm_unreachable("Invalid opcode");
	case X86::VPERMILPSrm:
	case X86::VPERMILPSYrm:
	case X86::VPERMILPSZ128rm:
	case X86::VPERMILPSZ256rm:
	case X86::VPERMILPSZrm:
	SrcIdx = 1; MaskIdx = 5; ElSize = 32; break;
	case X86::VPERMILPSZ128rmkz:
	case X86::VPERMILPSZ256rmkz:
	case X86::VPERMILPSZrmkz:
	SrcIdx = 2; MaskIdx = 6; ElSize = 32; break;
	case X86::VPERMILPSZ128rmk:
	case X86::VPERMILPSZ256rmk:
	case X86::VPERMILPSZrmk:
	SrcIdx = 3; MaskIdx = 7; ElSize = 32; break;
	case X86::VPERMILPDrm:
	case X86::VPERMILPDYrm:
	case X86::VPERMILPDZ128rm:
	case X86::VPERMILPDZ256rm:
	case X86::VPERMILPDZrm:
	SrcIdx = 1; MaskIdx = 5; ElSize = 64; break;
	case X86::VPERMILPDZ128rmkz:
	case X86::VPERMILPDZ256rmkz:
	case X86::VPERMILPDZrmkz:
	SrcIdx = 2; MaskIdx = 6; ElSize = 64; break;
	case X86::VPERMILPDZ128rmk:
	case X86::VPERMILPDZ256rmk:
	case X86::VPERMILPDZrmk:
	SrcIdx = 3; MaskIdx = 7; ElSize = 64; break;
	}

	assert(MI->getNumOperands() >= 6 &&
	"We should always have at least 6 operands!");

	const MachineOperand &MaskOp = MI->getOperand(MaskIdx);
	if (auto C = getConstantFromPool(MI, MaskOp)) {
	SmallVector<int, 16> Mask;
	DecodeVPERMILPMask(C, ElSize, Mask);
	if (!Mask.empty())
	OutStreamer->AddComment(getShuffleComment(MI, SrcIdx, SrcIdx, Mask));
	}
	break;
	}

	case X86::VPERMIL2PDrm:
	case X86::VPERMIL2PSrm:
	case X86::VPERMIL2PDrmY:
	case X86::VPERMIL2PSrmY: {
	if (!OutStreamer->isVerboseAsm())
	break;
	assert(MI->getNumOperands() >= 8 &&
	"We should always have at least 8 operands!");

	// The control value is read from the last operand and must be an immediate.
	const MachineOperand &CtrlOp = MI->getOperand(MI->getNumOperands() - 1);
	if (!CtrlOp.isImm())
	break;

	unsigned ElSize;
	switch (MI->getOpcode()) {
	default: llvm_unreachable("Invalid opcode");
	case X86::VPERMIL2PSrm: case X86::VPERMIL2PSrmY: ElSize = 32; break;
	case X86::VPERMIL2PDrm: case X86::VPERMIL2PDrmY: ElSize = 64; break;
	}

	const MachineOperand &MaskOp = MI->getOperand(6);
	if (auto C = getConstantFromPool(MI, MaskOp)) {
	SmallVector<int, 16> Mask;
	DecodeVPERMIL2PMask(C, (unsigned)CtrlOp.getImm(), ElSize, Mask);
	if (!Mask.empty())
	OutStreamer->AddComment(getShuffleComment(MI, 1, 2, Mask));
	}
	break;
	}

	case X86::VPPERMrrm: {
	if (!OutStreamer->isVerboseAsm())
	break;
	assert(MI->getNumOperands() >= 7 &&
	"We should always have at least 7 operands!");

	const MachineOperand &MaskOp = MI->getOperand(6);
	if (auto C = getConstantFromPool(MI, MaskOp)) {
	SmallVector<int, 16> Mask;
	DecodeVPPERMMask(C, Mask);
	if (!Mask.empty())
	OutStreamer->AddComment(getShuffleComment(MI, 1, 2, Mask));
	}
	break;
	}

	// The macros below expand to the case labels of the vector load (rm)
	// opcodes handled by CASE_ALL_MOV_RM().
	#define MOV_CASE(Prefix, Suffix) \
	case X86::Prefix##MOVAPD##Suffix##rm: \
	case X86::Prefix##MOVAPS##Suffix##rm: \
	case X86::Prefix##MOVUPD##Suffix##rm: \
	case X86::Prefix##MOVUPS##Suffix##rm: \
	case X86::Prefix##MOVDQA##Suffix##rm: \
	case X86::Prefix##MOVDQU##Suffix##rm:

	#define MOV_AVX512_CASE(Suffix) \
	case X86::VMOVDQA64##Suffix##rm: \
	case X86::VMOVDQA32##Suffix##rm: \
	case X86::VMOVDQU64##Suffix##rm: \
	case X86::VMOVDQU32##Suffix##rm: \
	case X86::VMOVDQU16##Suffix##rm: \
	case X86::VMOVDQU8##Suffix##rm: \
	case X86::VMOVAPS##Suffix##rm: \
	case X86::VMOVAPD##Suffix##rm: \
	case X86::VMOVUPS##Suffix##rm: \
	case X86::VMOVUPD##Suffix##rm:

	#define CASE_ALL_MOV_RM() \
	MOV_CASE(, ) /* SSE */ \
	MOV_CASE(V, ) /* AVX-128 */ \
	MOV_CASE(V, Y) /* AVX-256 */ \
	MOV_AVX512_CASE(Z) \
	MOV_AVX512_CASE(Z256) \
	MOV_AVX512_CASE(Z128)

	// For loads from a constant pool to a vector register, print the constant
	// loaded.
	CASE_ALL_MOV_RM()
	if (!OutStreamer->isVerboseAsm())
	break;
	if (MI->getNumOperands() <= 4)
	break;
	if (auto C = getConstantFromPool(MI, MI->getOperand(4))) {
	std::string Comment;
	raw_string_ostream CS(Comment);
	const MachineOperand &DstOp = MI->getOperand(0);
	CS << X86ATTInstPrinter::getRegisterName(DstOp.getReg()) << " = ";
	// Sequential data is printed as [a,b,...]; ConstantVector operands are
	// printed as <a,b,...>.
	if (auto *CDS = dyn_cast<ConstantDataSequential>(C)) {
	CS << "[";
	for (int i = 0, NumElements = CDS->getNumElements(); i < NumElements; ++i) {
	if (i != 0)
	CS << ",";
	if (CDS->getElementType()->isIntegerTy())
	CS << CDS->getElementAsInteger(i);
	else if (CDS->getElementType()->isFloatTy())
	CS << CDS->getElementAsFloat(i);
	else if (CDS->getElementType()->isDoubleTy())
	CS << CDS->getElementAsDouble(i);
	else
	CS << "?";
	}
	CS << "]";
	OutStreamer->AddComment(CS.str());
	} else if (auto *CV = dyn_cast<ConstantVector>(C)) {
	CS << "<";
	for (int i = 0, NumOperands = CV->getNumOperands(); i < NumOperands; ++i) {
	if (i != 0)
	CS << ",";
	Constant *COp = CV->getOperand(i);
	if (isa<UndefValue>(COp)) {
	CS << "u";
	} else if (auto *CI = dyn_cast<ConstantInt>(COp)) {
	if (CI->getBitWidth() <= 64) {
	CS << CI->getZExtValue();
	} else {
	// print multi-word constant as (w0,w1)
	const auto &Val = CI->getValue();
	CS << "(";
	for (int i = 0, N = Val.getNumWords(); i < N; ++i) {
	if (i > 0)
	CS << ",";
	CS << Val.getRawData()[i];
	}
	CS << ")";
	}
	} else if (auto *CF = dyn_cast<ConstantFP>(COp)) {
	SmallString<32> Str;
	CF->getValueAPF().toString(Str);
	CS << Str;
	} else {
	CS << "?";
	}
	}
	CS << ">";
	OutStreamer->AddComment(CS.str());
	}
	}
	break;
	}

	// Generic path: lower the MachineInstr to an MCInst and emit it.
	MCInst TmpInst;
	MCInstLowering.Lower(MI, TmpInst);

	// Stackmap shadows cannot include branch targets, so we can count the bytes
	// in a call towards the shadow, but must ensure that no thread returns
	// in to the stackmap shadow. The only way to achieve this is if the call
	// is at the end of the shadow.
	if (MI->isCall()) {
	// Count the size of the call towards the shadow
	SMShadowTracker.count(TmpInst, getSubtargetInfo(), CodeEmitter.get());
	// Then flush the shadow so that we fill with nops before the call, not
	// after it.
	SMShadowTracker.emitShadowPadding(*OutStreamer, getSubtargetInfo());
	// Then emit the call
	OutStreamer->EmitInstruction(TmpInst, getSubtargetInfo());
	return;
	}

	EmitAndCountInstruction(TmpInst);
	}
	Index: projects/clang400-import/contrib/llvm/tools/clang
	===================================================================
	--- projects/clang400-import/contrib/llvm/tools/clang (revision 313642)
	+++ projects/clang400-import/contrib/llvm/tools/clang (revision 313643)

	Property changes on: projects/clang400-import/contrib/llvm/tools/clang
	___________________________________________________________________
	Modified: svn:mergeinfo
	## -0,0 +0,1 ##
	Merged /vendor/clang/dist:r313300-313642
	Index: projects/clang400-import/contrib/llvm/tools/lld
	===================================================================
	--- projects/clang400-import/contrib/llvm/tools/lld (revision 313642)
	+++ projects/clang400-import/contrib/llvm/tools/lld (revision 313643)

	Property changes on: projects/clang400-import/contrib/llvm/tools/lld
	___________________________________________________________________
	Modified: svn:mergeinfo
	## -0,0 +0,1 ##
	Merged /vendor/lld/dist:r313300-313642
	Index: projects/clang400-import/contrib/llvm/tools/lldb
	===================================================================
	--- projects/clang400-import/contrib/llvm/tools/lldb (revision 313642)
	+++ projects/clang400-import/contrib/llvm/tools/lldb (revision 313643)

	Property changes on: projects/clang400-import/contrib/llvm/tools/lldb
	___________________________________________________________________
	Modified: svn:mergeinfo
	## -0,0 +0,1 ##
	Merged /vendor/lldb/dist:r313300-313642
	Index: projects/clang400-import/contrib/llvm
	===================================================================
	--- projects/clang400-import/contrib/llvm (revision 313642)
	+++ projects/clang400-import/contrib/llvm (revision 313643)

	Property changes on: projects/clang400-import/contrib/llvm
	___________________________________________________________________
	Modified: svn:mergeinfo
	## -0,0 +0,1 ##
	Merged /vendor/llvm/dist:r313300-313642
	Index: projects/clang400-import/lib/clang/include/clang/Basic/Version.inc
	===================================================================
	--- projects/clang400-import/lib/clang/include/clang/Basic/Version.inc (revision 313642)
	+++ projects/clang400-import/lib/clang/include/clang/Basic/Version.inc (revision 313643)
	@@ -1,11 +1,11 @@
	/* $FreeBSD$ */

	#define CLANG_VERSION 4.0.0
	#define CLANG_VERSION_STRING "4.0.0"
	#define CLANG_VERSION_MAJOR 4
	#define CLANG_VERSION_MINOR 0
	#define CLANG_VERSION_PATCHLEVEL 0

	#define CLANG_VENDOR "FreeBSD "

	-#define SVN_REVISION "294123"
	+#define SVN_REVISION "294803"
	Index: projects/clang400-import/lib/clang/include/lld/Config/Version.inc
	===================================================================
	--- projects/clang400-import/lib/clang/include/lld/Config/Version.inc (revision 313642)
	+++ projects/clang400-import/lib/clang/include/lld/Config/Version.inc (revision 313643)
	@@ -1,8 +1,8 @@
	// $FreeBSD$

	#define LLD_VERSION 4.0.0
	#define LLD_VERSION_STRING "4.0.0"
	#define LLD_VERSION_MAJOR 4
	#define LLD_VERSION_MINOR 0
	-#define LLD_REVISION_STRING "294123"
	+#define LLD_REVISION_STRING "294803"
	#define LLD_REPOSITORY_STRING "FreeBSD"

File Metadata

Mime Type: text/x-c++
Expires: Thu, Mar 19, 6:31 AM (2 d)
Storage Engine: local-disk
Storage Format: Raw Data
Storage Handle: 39/fa/4da7c987d8c412f61443efbad14e
Default Alt Text: (1 MB)

No OneTemporaryActions

View Options

File Metadata

Event Timeline

No OneTemporary
Actions