Page MenuHomeFreeBSD

No OneTemporary

This file is larger than 256 KB, so syntax highlighting was skipped.
Index: projects/clang400-import/contrib/compiler-rt
===================================================================
--- projects/clang400-import/contrib/compiler-rt (revision 313642)
+++ projects/clang400-import/contrib/compiler-rt (revision 313643)
Property changes on: projects/clang400-import/contrib/compiler-rt
___________________________________________________________________
Modified: svn:mergeinfo
## -0,0 +0,1 ##
Merged /vendor/compiler-rt/dist:r313300-313642
Index: projects/clang400-import/contrib/libc++/include/optional
===================================================================
--- projects/clang400-import/contrib/libc++/include/optional (revision 313642)
+++ projects/clang400-import/contrib/libc++/include/optional (revision 313643)
@@ -1,1314 +1,1312 @@
// -*- C++ -*-
//===-------------------------- optional ----------------------------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is dual licensed under the MIT and the University of Illinois Open
// Source Licenses. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
#ifndef _LIBCPP_OPTIONAL
#define _LIBCPP_OPTIONAL
/*
optional synopsis
// C++1z
namespace std {
// 20.6.3, optional for object types
template <class T> class optional;
// 20.6.4, no-value state indicator
struct nullopt_t{see below };
constexpr nullopt_t nullopt(unspecified );
// 20.6.5, class bad_optional_access
class bad_optional_access;
// 20.6.6, relational operators
template <class T>
constexpr bool operator==(const optional<T>&, const optional<T>&);
template <class T>
constexpr bool operator!=(const optional<T>&, const optional<T>&);
template <class T>
constexpr bool operator<(const optional<T>&, const optional<T>&);
template <class T>
constexpr bool operator>(const optional<T>&, const optional<T>&);
template <class T>
constexpr bool operator<=(const optional<T>&, const optional<T>&);
template <class T>
constexpr bool operator>=(const optional<T>&, const optional<T>&);
template <class T> constexpr bool operator==(const optional<T>&, nullopt_t) noexcept;
template <class T> constexpr bool operator==(nullopt_t, const optional<T>&) noexcept;
template <class T> constexpr bool operator!=(const optional<T>&, nullopt_t) noexcept;
template <class T> constexpr bool operator!=(nullopt_t, const optional<T>&) noexcept;
template <class T> constexpr bool operator<(const optional<T>&, nullopt_t) noexcept;
template <class T> constexpr bool operator<(nullopt_t, const optional<T>&) noexcept;
template <class T> constexpr bool operator<=(const optional<T>&, nullopt_t) noexcept;
template <class T> constexpr bool operator<=(nullopt_t, const optional<T>&) noexcept;
template <class T> constexpr bool operator>(const optional<T>&, nullopt_t) noexcept;
template <class T> constexpr bool operator>(nullopt_t, const optional<T>&) noexcept;
template <class T> constexpr bool operator>=(const optional<T>&, nullopt_t) noexcept;
template <class T> constexpr bool operator>=(nullopt_t, const optional<T>&) noexcept;
// 20.6.8, comparison with T
template <class T> constexpr bool operator==(const optional<T>&, const T&);
template <class T> constexpr bool operator==(const T&, const optional<T>&);
template <class T> constexpr bool operator!=(const optional<T>&, const T&);
template <class T> constexpr bool operator!=(const T&, const optional<T>&);
template <class T> constexpr bool operator<(const optional<T>&, const T&);
template <class T> constexpr bool operator<(const T&, const optional<T>&);
template <class T> constexpr bool operator<=(const optional<T>&, const T&);
template <class T> constexpr bool operator<=(const T&, const optional<T>&);
template <class T> constexpr bool operator>(const optional<T>&, const T&);
template <class T> constexpr bool operator>(const T&, const optional<T>&);
template <class T> constexpr bool operator>=(const optional<T>&, const T&);
template <class T> constexpr bool operator>=(const T&, const optional<T>&);
// 20.6.9, specialized algorithms
template <class T> void swap(optional<T>&, optional<T>&) noexcept(see below );
template <class T> constexpr optional<see below > make_optional(T&&);
template <class T, class... Args>
constexpr optional<T> make_optional(Args&&... args);
template <class T, class U, class... Args>
constexpr optional<T> make_optional(initializer_list<U> il, Args&&... args);
// 20.6.10, hash support
template <class T> struct hash;
template <class T> struct hash<optional<T>>;
template <class T> class optional {
public:
using value_type = T;
// 20.6.3.1, constructors
constexpr optional() noexcept;
constexpr optional(nullopt_t) noexcept;
optional(const optional &);
optional(optional &&) noexcept(see below );
template <class... Args> constexpr explicit optional(in_place_t, Args &&...);
template <class U, class... Args>
constexpr explicit optional(in_place_t, initializer_list<U>, Args &&...);
template <class U = T>
constexpr EXPLICIT optional(U &&);
template <class U>
constexpr EXPLICIT optional(const optional<U> &);
template <class U>
constexpr EXPLICIT optional(optional<U> &&);
// 20.6.3.2, destructor
~optional();
// 20.6.3.3, assignment
optional &operator=(nullopt_t) noexcept;
optional &operator=(const optional &);
optional &operator=(optional &&) noexcept(see below );
template <class U = T> optional &operator=(U &&);
template <class U> optional &operator=(const optional<U> &);
template <class U> optional &operator=(optional<U> &&);
template <class... Args> void emplace(Args &&...);
template <class U, class... Args>
void emplace(initializer_list<U>, Args &&...);
// 20.6.3.4, swap
void swap(optional &) noexcept(see below );
// 20.6.3.5, observers
constexpr T const *operator->() const;
constexpr T *operator->();
constexpr T const &operator*() const &;
constexpr T &operator*() &;
constexpr T &&operator*() &&;
constexpr const T &&operator*() const &&;
constexpr explicit operator bool() const noexcept;
constexpr bool has_value() const noexcept;
constexpr T const &value() const &;
constexpr T &value() &;
constexpr T &&value() &&;
constexpr const T &&value() const &&;
template <class U> constexpr T value_or(U &&) const &;
template <class U> constexpr T value_or(U &&) &&;
// 20.6.3.6, modifiers
void reset() noexcept;
private:
T *val; // exposition only
};
} // namespace std
*/
#include <__config>
#include <__debug>
#include <__functional_base>
#include <__undef_min_max>
#include <functional>
#include <initializer_list>
#include <new>
#include <stdexcept>
#include <type_traits>
#include <utility>
#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
#pragma GCC system_header
#endif
namespace std // purposefully not using versioning namespace
{
class _LIBCPP_EXCEPTION_ABI bad_optional_access
- : public logic_error
+ : public exception
{
public:
- _LIBCPP_INLINE_VISIBILITY
- bad_optional_access() : logic_error("bad optional access") {}
-
// Get the key function ~bad_optional_access() into the dylib
virtual ~bad_optional_access() _NOEXCEPT;
+ virtual const char* what() const _NOEXCEPT;
};
} // std
#if _LIBCPP_STD_VER > 14
_LIBCPP_BEGIN_NAMESPACE_STD
_LIBCPP_NORETURN
inline _LIBCPP_INLINE_VISIBILITY
void __throw_bad_optional_access() {
#ifndef _LIBCPP_NO_EXCEPTIONS
throw bad_optional_access();
#else
_VSTD::abort();
#endif
}
struct nullopt_t
{
struct __secret_tag { _LIBCPP_INLINE_VISIBILITY explicit __secret_tag() = default; };
_LIBCPP_INLINE_VISIBILITY constexpr explicit nullopt_t(__secret_tag, __secret_tag) noexcept {}
};
/* inline */ constexpr nullopt_t nullopt{nullopt_t::__secret_tag{}, nullopt_t::__secret_tag{}};
template <class _Tp, bool = is_trivially_destructible<_Tp>::value>
struct __optional_destruct_base;
template <class _Tp>
struct __optional_destruct_base<_Tp, false>
{
typedef _Tp value_type;
static_assert(is_object_v<value_type>,
"instantiation of optional with a non-object type is undefined behavior");
union
{
char __null_state_;
value_type __val_;
};
bool __engaged_;
_LIBCPP_INLINE_VISIBILITY
~__optional_destruct_base()
{
if (__engaged_)
__val_.~value_type();
}
_LIBCPP_INLINE_VISIBILITY
constexpr __optional_destruct_base() noexcept
: __null_state_(),
__engaged_(false) {}
template <class... _Args>
_LIBCPP_INLINE_VISIBILITY
constexpr explicit __optional_destruct_base(in_place_t, _Args&&... __args)
: __val_(_VSTD::forward<_Args>(__args)...),
__engaged_(true) {}
_LIBCPP_INLINE_VISIBILITY
void reset() noexcept
{
if (__engaged_)
{
__val_.~value_type();
__engaged_ = false;
}
}
};
template <class _Tp>
struct __optional_destruct_base<_Tp, true>
{
typedef _Tp value_type;
static_assert(is_object_v<value_type>,
"instantiation of optional with a non-object type is undefined behavior");
union
{
char __null_state_;
value_type __val_;
};
bool __engaged_;
_LIBCPP_INLINE_VISIBILITY
constexpr __optional_destruct_base() noexcept
: __null_state_(),
__engaged_(false) {}
template <class... _Args>
_LIBCPP_INLINE_VISIBILITY
constexpr explicit __optional_destruct_base(in_place_t, _Args&&... __args)
: __val_(_VSTD::forward<_Args>(__args)...),
__engaged_(true) {}
_LIBCPP_INLINE_VISIBILITY
void reset() noexcept
{
if (__engaged_)
{
__engaged_ = false;
}
}
};
template <class _Tp, bool = is_reference<_Tp>::value>
struct __optional_storage_base : __optional_destruct_base<_Tp>
{
using __base = __optional_destruct_base<_Tp>;
using value_type = _Tp;
using __base::__base;
_LIBCPP_INLINE_VISIBILITY
constexpr bool has_value() const noexcept
{
return this->__engaged_;
}
_LIBCPP_INLINE_VISIBILITY
constexpr value_type& __get() & noexcept
{
return this->__val_;
}
_LIBCPP_INLINE_VISIBILITY
constexpr const value_type& __get() const& noexcept
{
return this->__val_;
}
_LIBCPP_INLINE_VISIBILITY
constexpr value_type&& __get() && noexcept
{
return _VSTD::move(this->__val_);
}
_LIBCPP_INLINE_VISIBILITY
constexpr const value_type&& __get() const&& noexcept
{
return _VSTD::move(this->__val_);
}
template <class... _Args>
_LIBCPP_INLINE_VISIBILITY
void __construct(_Args&&... __args)
{
_LIBCPP_ASSERT(!has_value(), "__construct called for engaged __optional_storage");
::new((void*)_VSTD::addressof(this->__val_)) value_type(_VSTD::forward<_Args>(__args)...);
this->__engaged_ = true;
}
template <class _That>
_LIBCPP_INLINE_VISIBILITY
void __construct_from(_That&& __opt)
{
if (__opt.has_value())
__construct(_VSTD::forward<_That>(__opt).__get());
}
template <class _That>
_LIBCPP_INLINE_VISIBILITY
void __assign_from(_That&& __opt)
{
if (this->__engaged_ == __opt.has_value())
{
if (this->__engaged_)
this->__val_ = _VSTD::forward<_That>(__opt).__get();
}
else
{
if (this->__engaged_)
this->reset();
else
__construct(_VSTD::forward<_That>(__opt).__get());
}
}
};
// optional<T&> is currently required ill-formed, however it may to be in the
// future. For this reason it has already been implemented to ensure we can
// make the change in an ABI compatible manner.
template <class _Tp>
struct __optional_storage_base<_Tp, true>
{
using value_type = _Tp;
using __raw_type = remove_reference_t<_Tp>;
__raw_type* __value_;
template <class _Up>
static constexpr bool __can_bind_reference() {
using _RawUp = typename remove_reference<_Up>::type;
using _UpPtr = _RawUp*;
using _RawTp = typename remove_reference<_Tp>::type;
using _TpPtr = _RawTp*;
using _CheckLValueArg = integral_constant<bool,
(is_lvalue_reference<_Up>::value && is_convertible<_UpPtr, _TpPtr>::value)
|| is_same<_RawUp, reference_wrapper<_RawTp>>::value
|| is_same<_RawUp, reference_wrapper<typename remove_const<_RawTp>::type>>::value
>;
return (is_lvalue_reference<_Tp>::value && _CheckLValueArg::value)
|| (is_rvalue_reference<_Tp>::value && !is_lvalue_reference<_Up>::value &&
is_convertible<_UpPtr, _TpPtr>::value);
}
_LIBCPP_INLINE_VISIBILITY
constexpr __optional_storage_base() noexcept
: __value_(nullptr) {}
template <class _UArg>
_LIBCPP_INLINE_VISIBILITY
constexpr explicit __optional_storage_base(in_place_t, _UArg&& __uarg)
: __value_(_VSTD::addressof(__uarg))
{
static_assert(__can_bind_reference<_UArg>(),
"Attempted to construct a reference element in tuple from a "
"possible temporary");
}
_LIBCPP_INLINE_VISIBILITY
void reset() noexcept { __value_ = nullptr; }
_LIBCPP_INLINE_VISIBILITY
constexpr bool has_value() const noexcept
{ return __value_ != nullptr; }
_LIBCPP_INLINE_VISIBILITY
constexpr value_type& __get() const& noexcept
{ return *__value_; }
_LIBCPP_INLINE_VISIBILITY
constexpr value_type&& __get() const&& noexcept
{ return _VSTD::forward<value_type>(*__value_); }
template <class _UArg>
_LIBCPP_INLINE_VISIBILITY
void __construct(_UArg&& __val)
{
_LIBCPP_ASSERT(!has_value(), "__construct called for engaged __optional_storage");
static_assert(__can_bind_reference<_UArg>(),
"Attempted to construct a reference element in tuple from a "
"possible temporary");
__value_ = _VSTD::addressof(__val);
}
template <class _That>
_LIBCPP_INLINE_VISIBILITY
void __construct_from(_That&& __opt)
{
if (__opt.has_value())
__construct(_VSTD::forward<_That>(__opt).__get());
}
template <class _That>
_LIBCPP_INLINE_VISIBILITY
void __assign_from(_That&& __opt)
{
if (has_value() == __opt.has_value())
{
if (has_value())
*__value_ = _VSTD::forward<_That>(__opt).__get();
}
else
{
if (has_value())
reset();
else
__construct(_VSTD::forward<_That>(__opt).__get());
}
}
};
template <class _Tp, bool = is_trivially_copyable<_Tp>::value>
struct __optional_storage;
template <class _Tp>
struct __optional_storage<_Tp, true> : __optional_storage_base<_Tp>
{
using __optional_storage_base<_Tp>::__optional_storage_base;
};
template <class _Tp>
struct __optional_storage<_Tp, false> : __optional_storage_base<_Tp>
{
using value_type = _Tp;
using __optional_storage_base<_Tp>::__optional_storage_base;
_LIBCPP_INLINE_VISIBILITY
__optional_storage() = default;
_LIBCPP_INLINE_VISIBILITY
__optional_storage(const __optional_storage& __opt)
{
this->__construct_from(__opt);
}
_LIBCPP_INLINE_VISIBILITY
__optional_storage(__optional_storage&& __opt)
noexcept(is_nothrow_move_constructible_v<value_type>)
{
this->__construct_from(_VSTD::move(__opt));
}
_LIBCPP_INLINE_VISIBILITY
__optional_storage& operator=(const __optional_storage& __opt)
{
this->__assign_from(__opt);
return *this;
}
_LIBCPP_INLINE_VISIBILITY
__optional_storage& operator=(__optional_storage&& __opt)
noexcept(is_nothrow_move_assignable_v<value_type> &&
is_nothrow_move_constructible_v<value_type>)
{
this->__assign_from(_VSTD::move(__opt));
return *this;
}
};
template <class _Tp>
using __optional_sfinae_ctor_base_t = __sfinae_ctor_base<
is_copy_constructible<_Tp>::value,
is_move_constructible<_Tp>::value
>;
template <class _Tp>
using __optional_sfinae_assign_base_t = __sfinae_assign_base<
(is_copy_constructible<_Tp>::value && is_copy_assignable<_Tp>::value),
(is_move_constructible<_Tp>::value && is_move_assignable<_Tp>::value)
>;
template <class _Tp>
class optional
: private __optional_storage<_Tp>
, private __optional_sfinae_ctor_base_t<_Tp>
, private __optional_sfinae_assign_base_t<_Tp>
{
using __base = __optional_storage<_Tp>;
public:
using value_type = _Tp;
private:
// Disable the reference extension using this static assert.
static_assert(!is_same_v<value_type, in_place_t>,
"instantiation of optional with in_place_t is ill-formed");
static_assert(!is_same_v<__uncvref_t<value_type>, nullopt_t>,
"instantiation of optional with nullopt_t is ill-formed");
static_assert(!is_reference_v<value_type>,
"instantiation of optional with a reference type is ill-formed");
static_assert(is_destructible_v<value_type>,
"instantiation of optional with a non-destructible type is ill-formed");
// LWG2756: conditionally explicit conversion from _Up
struct _CheckOptionalArgsConstructor {
template <class _Up>
static constexpr bool __enable_implicit() {
return is_constructible_v<_Tp, _Up&&> &&
is_convertible_v<_Up&&, _Tp>;
}
template <class _Up>
static constexpr bool __enable_explicit() {
return is_constructible_v<_Tp, _Up&&> &&
!is_convertible_v<_Up&&, _Tp>;
}
};
template <class _Up>
using _CheckOptionalArgsCtor = conditional_t<
!is_same_v<in_place_t, _Up> &&
!is_same_v<decay_t<_Up>, optional>,
_CheckOptionalArgsConstructor,
__check_tuple_constructor_fail
>;
template <class _QualUp>
struct _CheckOptionalLikeConstructor {
template <class _Up, class _Opt = optional<_Up>>
using __check_constructible_from_opt = __lazy_or<
is_constructible<_Tp, _Opt&>,
is_constructible<_Tp, _Opt const&>,
is_constructible<_Tp, _Opt&&>,
is_constructible<_Tp, _Opt const&&>,
is_convertible<_Opt&, _Tp>,
is_convertible<_Opt const&, _Tp>,
is_convertible<_Opt&&, _Tp>,
is_convertible<_Opt const&&, _Tp>
>;
template <class _Up, class _Opt = optional<_Up>>
using __check_assignable_from_opt = __lazy_or<
is_assignable<_Tp&, _Opt&>,
is_assignable<_Tp&, _Opt const&>,
is_assignable<_Tp&, _Opt&&>,
is_assignable<_Tp&, _Opt const&&>
>;
template <class _Up, class _QUp = _QualUp>
static constexpr bool __enable_implicit() {
return is_convertible<_QUp, _Tp>::value &&
!__check_constructible_from_opt<_Up>::value;
}
template <class _Up, class _QUp = _QualUp>
static constexpr bool __enable_explicit() {
return !is_convertible<_QUp, _Tp>::value &&
!__check_constructible_from_opt<_Up>::value;
}
template <class _Up, class _QUp = _QualUp>
static constexpr bool __enable_assign() {
// Construction and assignability of _Qup to _Tp has already been
// checked.
return !__check_constructible_from_opt<_Up>::value &&
!__check_assignable_from_opt<_Up>::value;
}
};
template <class _Up, class _QualUp>
using _CheckOptionalLikeCtor = conditional_t<
__lazy_and<
__lazy_not<is_same<_Up, _Tp>>,
is_constructible<_Tp, _QualUp>
>::value,
_CheckOptionalLikeConstructor<_QualUp>,
__check_tuple_constructor_fail
>;
template <class _Up, class _QualUp>
using _CheckOptionalLikeAssign = conditional_t<
__lazy_and<
__lazy_not<is_same<_Up, _Tp>>,
is_constructible<_Tp, _QualUp>,
is_assignable<_Tp&, _QualUp>
>::value,
_CheckOptionalLikeConstructor<_QualUp>,
__check_tuple_constructor_fail
>;
public:
_LIBCPP_INLINE_VISIBILITY constexpr optional() noexcept {}
_LIBCPP_INLINE_VISIBILITY optional(const optional&) = default;
_LIBCPP_INLINE_VISIBILITY optional(optional&&) = default;
_LIBCPP_INLINE_VISIBILITY constexpr optional(nullopt_t) noexcept {}
template <class... _Args, class = enable_if_t<
is_constructible_v<value_type, _Args...>>
>
_LIBCPP_INLINE_VISIBILITY
constexpr explicit optional(in_place_t, _Args&&... __args)
: __base(in_place, _VSTD::forward<_Args>(__args)...) {}
template <class _Up, class... _Args, class = enable_if_t<
is_constructible_v<value_type, initializer_list<_Up>&, _Args...>>
>
_LIBCPP_INLINE_VISIBILITY
constexpr explicit optional(in_place_t, initializer_list<_Up> __il, _Args&&... __args)
: __base(in_place, __il, _VSTD::forward<_Args>(__args)...) {}
template <class _Up = value_type, enable_if_t<
_CheckOptionalArgsCtor<_Up>::template __enable_implicit<_Up>()
, int> = 0>
_LIBCPP_INLINE_VISIBILITY
constexpr optional(_Up&& __v)
: __base(in_place, _VSTD::forward<_Up>(__v)) {}
template <class _Up, enable_if_t<
_CheckOptionalArgsCtor<_Up>::template __enable_explicit<_Up>()
, int> = 0>
_LIBCPP_INLINE_VISIBILITY
constexpr explicit optional(_Up&& __v)
: __base(in_place, _VSTD::forward<_Up>(__v)) {}
// LWG2756: conditionally explicit conversion from const optional<_Up>&
template <class _Up, enable_if_t<
_CheckOptionalLikeCtor<_Up, _Up const&>::template __enable_implicit<_Up>()
, int> = 0>
_LIBCPP_INLINE_VISIBILITY
optional(const optional<_Up>& __v)
{
this->__construct_from(__v);
}
template <class _Up, enable_if_t<
_CheckOptionalLikeCtor<_Up, _Up const&>::template __enable_explicit<_Up>()
, int> = 0>
_LIBCPP_INLINE_VISIBILITY
explicit optional(const optional<_Up>& __v)
{
this->__construct_from(__v);
}
// LWG2756: conditionally explicit conversion from optional<_Up>&&
template <class _Up, enable_if_t<
_CheckOptionalLikeCtor<_Up, _Up &&>::template __enable_implicit<_Up>()
, int> = 0>
_LIBCPP_INLINE_VISIBILITY
optional(optional<_Up>&& __v)
{
this->__construct_from(_VSTD::move(__v));
}
template <class _Up, enable_if_t<
_CheckOptionalLikeCtor<_Up, _Up &&>::template __enable_explicit<_Up>()
, int> = 0>
_LIBCPP_INLINE_VISIBILITY
explicit optional(optional<_Up>&& __v)
{
this->__construct_from(_VSTD::move(__v));
}
_LIBCPP_INLINE_VISIBILITY
optional& operator=(nullopt_t) noexcept
{
reset();
return *this;
}
_LIBCPP_INLINE_VISIBILITY optional& operator=(const optional&) = default;
_LIBCPP_INLINE_VISIBILITY optional& operator=(optional&&) = default;
// LWG2756
template <class _Up = value_type,
class = enable_if_t
<__lazy_and<
integral_constant<bool,
!is_same_v<decay_t<_Up>, optional> &&
!(is_same_v<_Up, value_type> && is_scalar_v<value_type>)
>,
is_constructible<value_type, _Up>,
is_assignable<value_type&, _Up>
>::value>
>
_LIBCPP_INLINE_VISIBILITY
optional&
operator=(_Up&& __v)
{
if (this->has_value())
this->__get() = _VSTD::forward<_Up>(__v);
else
this->__construct(_VSTD::forward<_Up>(__v));
return *this;
}
// LWG2756
template <class _Up, enable_if_t<
_CheckOptionalLikeAssign<_Up, _Up const&>::template __enable_assign<_Up>()
, int> = 0>
_LIBCPP_INLINE_VISIBILITY
optional&
operator=(const optional<_Up>& __v)
{
this->__assign_from(__v);
return *this;
}
// LWG2756
template <class _Up, enable_if_t<
_CheckOptionalLikeCtor<_Up, _Up &&>::template __enable_assign<_Up>()
, int> = 0>
_LIBCPP_INLINE_VISIBILITY
optional&
operator=(optional<_Up>&& __v)
{
this->__assign_from(_VSTD::move(__v));
return *this;
}
template <class... _Args,
class = enable_if_t
<
is_constructible_v<value_type, _Args...>
>
>
_LIBCPP_INLINE_VISIBILITY
void
emplace(_Args&&... __args)
{
reset();
this->__construct(_VSTD::forward<_Args>(__args)...);
}
template <class _Up, class... _Args,
class = enable_if_t
<
is_constructible_v<value_type, initializer_list<_Up>&, _Args...>
>
>
_LIBCPP_INLINE_VISIBILITY
void
emplace(initializer_list<_Up> __il, _Args&&... __args)
{
reset();
this->__construct(__il, _VSTD::forward<_Args>(__args)...);
}
_LIBCPP_INLINE_VISIBILITY
void swap(optional& __opt)
noexcept(is_nothrow_move_constructible_v<value_type> &&
is_nothrow_swappable_v<value_type>)
{
if (this->has_value() == __opt.has_value())
{
using _VSTD::swap;
if (this->has_value())
swap(this->__get(), __opt.__get());
}
else
{
if (this->has_value())
{
__opt.__construct(_VSTD::move(this->__get()));
reset();
}
else
{
this->__construct(_VSTD::move(__opt.__get()));
__opt.reset();
}
}
}
_LIBCPP_INLINE_VISIBILITY
constexpr
add_pointer_t<value_type const>
operator->() const
{
_LIBCPP_ASSERT(this->has_value(), "optional operator-> called for disengaged value");
#ifndef _LIBCPP_HAS_NO_BUILTIN_ADDRESSOF
return _VSTD::addressof(this->__get());
#else
return __operator_arrow(__has_operator_addressof<value_type>{}, this->__get());
#endif
}
_LIBCPP_INLINE_VISIBILITY
constexpr
add_pointer_t<value_type>
operator->()
{
_LIBCPP_ASSERT(this->has_value(), "optional operator-> called for disengaged value");
#ifndef _LIBCPP_HAS_NO_BUILTIN_ADDRESSOF
return _VSTD::addressof(this->__get());
#else
return __operator_arrow(__has_operator_addressof<value_type>{}, this->__get());
#endif
}
_LIBCPP_INLINE_VISIBILITY
constexpr
const value_type&
operator*() const&
{
_LIBCPP_ASSERT(this->has_value(), "optional operator* called for disengaged value");
return this->__get();
}
_LIBCPP_INLINE_VISIBILITY
constexpr
value_type&
operator*() &
{
_LIBCPP_ASSERT(this->has_value(), "optional operator* called for disengaged value");
return this->__get();
}
_LIBCPP_INLINE_VISIBILITY
constexpr
value_type&&
operator*() &&
{
_LIBCPP_ASSERT(this->has_value(), "optional operator* called for disengaged value");
return _VSTD::move(this->__get());
}
_LIBCPP_INLINE_VISIBILITY
constexpr
const value_type&&
operator*() const&&
{
_LIBCPP_ASSERT(this->has_value(), "optional operator* called for disengaged value");
return _VSTD::move(this->__get());
}
_LIBCPP_INLINE_VISIBILITY
constexpr explicit operator bool() const noexcept { return has_value(); }
using __base::has_value;
using __base::__get;
_LIBCPP_INLINE_VISIBILITY
constexpr value_type const& value() const&
{
if (!this->has_value())
__throw_bad_optional_access();
return this->__get();
}
_LIBCPP_INLINE_VISIBILITY
constexpr value_type& value() &
{
if (!this->has_value())
__throw_bad_optional_access();
return this->__get();
}
_LIBCPP_INLINE_VISIBILITY
constexpr value_type&& value() &&
{
if (!this->has_value())
__throw_bad_optional_access();
return _VSTD::move(this->__get());
}
_LIBCPP_INLINE_VISIBILITY
constexpr value_type const&& value() const&&
{
if (!this->has_value())
__throw_bad_optional_access();
return _VSTD::move(this->__get());
}
template <class _Up>
_LIBCPP_INLINE_VISIBILITY
constexpr value_type value_or(_Up&& __v) const&
{
static_assert(is_copy_constructible_v<value_type>,
"optional<T>::value_or: T must be copy constructible");
static_assert(is_convertible_v<_Up, value_type>,
"optional<T>::value_or: U must be convertible to T");
return this->has_value() ? this->__get() :
static_cast<value_type>(_VSTD::forward<_Up>(__v));
}
template <class _Up>
_LIBCPP_INLINE_VISIBILITY
value_type value_or(_Up&& __v) &&
{
static_assert(is_move_constructible_v<value_type>,
"optional<T>::value_or: T must be move constructible");
static_assert(is_convertible_v<_Up, value_type>,
"optional<T>::value_or: U must be convertible to T");
return this->has_value() ? _VSTD::move(this->__get()) :
static_cast<value_type>(_VSTD::forward<_Up>(__v));
}
using __base::reset;
private:
template <class _Up>
_LIBCPP_INLINE_VISIBILITY
static _Up*
__operator_arrow(true_type, _Up& __x)
{
return _VSTD::addressof(__x);
}
template <class _Up>
_LIBCPP_INLINE_VISIBILITY
static constexpr _Up*
__operator_arrow(false_type, _Up& __x)
{
return &__x;
}
};
// Comparisons between optionals
template <class _Tp>
_LIBCPP_INLINE_VISIBILITY constexpr
enable_if_t<
is_convertible_v<decltype(_VSTD::declval<const _Tp&>() ==
_VSTD::declval<const _Tp&>()), bool>,
bool
>
operator==(const optional<_Tp>& __x, const optional<_Tp>& __y)
{
if (static_cast<bool>(__x) != static_cast<bool>(__y))
return false;
if (!static_cast<bool>(__x))
return true;
return *__x == *__y;
}
template <class _Tp>
_LIBCPP_INLINE_VISIBILITY constexpr
enable_if_t<
is_convertible_v<decltype(_VSTD::declval<const _Tp&>() !=
_VSTD::declval<const _Tp&>()), bool>,
bool
>
operator!=(const optional<_Tp>& __x, const optional<_Tp>& __y)
{
if (static_cast<bool>(__x) != static_cast<bool>(__y))
return true;
if (!static_cast<bool>(__x))
return false;
return *__x != *__y;
}
template <class _Tp>
_LIBCPP_INLINE_VISIBILITY constexpr
enable_if_t<
is_convertible_v<decltype(_VSTD::declval<const _Tp&>() <
_VSTD::declval<const _Tp&>()), bool>,
bool
>
operator<(const optional<_Tp>& __x, const optional<_Tp>& __y)
{
if (!static_cast<bool>(__y))
return false;
if (!static_cast<bool>(__x))
return true;
return *__x < *__y;
}
template <class _Tp>
_LIBCPP_INLINE_VISIBILITY constexpr
enable_if_t<
is_convertible_v<decltype(_VSTD::declval<const _Tp&>() >
_VSTD::declval<const _Tp&>()), bool>,
bool
>
operator>(const optional<_Tp>& __x, const optional<_Tp>& __y)
{
if (!static_cast<bool>(__x))
return false;
if (!static_cast<bool>(__y))
return true;
return *__x > *__y;
}
template <class _Tp>
_LIBCPP_INLINE_VISIBILITY constexpr
enable_if_t<
is_convertible_v<decltype(_VSTD::declval<const _Tp&>() <=
_VSTD::declval<const _Tp&>()), bool>,
bool
>
operator<=(const optional<_Tp>& __x, const optional<_Tp>& __y)
{
if (!static_cast<bool>(__x))
return true;
if (!static_cast<bool>(__y))
return false;
return *__x <= *__y;
}
template <class _Tp>
_LIBCPP_INLINE_VISIBILITY constexpr
enable_if_t<
is_convertible_v<decltype(_VSTD::declval<const _Tp&>() >=
_VSTD::declval<const _Tp&>()), bool>,
bool
>
operator>=(const optional<_Tp>& __x, const optional<_Tp>& __y)
{
if (!static_cast<bool>(__y))
return true;
if (!static_cast<bool>(__x))
return false;
return *__x >= *__y;
}
// Comparisons with nullopt
template <class _Tp>
_LIBCPP_INLINE_VISIBILITY constexpr
bool
operator==(const optional<_Tp>& __x, nullopt_t) noexcept
{
return !static_cast<bool>(__x);
}
template <class _Tp>
_LIBCPP_INLINE_VISIBILITY constexpr
bool
operator==(nullopt_t, const optional<_Tp>& __x) noexcept
{
return !static_cast<bool>(__x);
}
template <class _Tp>
_LIBCPP_INLINE_VISIBILITY constexpr
bool
operator!=(const optional<_Tp>& __x, nullopt_t) noexcept
{
return static_cast<bool>(__x);
}
template <class _Tp>
_LIBCPP_INLINE_VISIBILITY constexpr
bool
operator!=(nullopt_t, const optional<_Tp>& __x) noexcept
{
return static_cast<bool>(__x);
}
template <class _Tp>
_LIBCPP_INLINE_VISIBILITY constexpr
bool
operator<(const optional<_Tp>&, nullopt_t) noexcept
{
return false;
}
template <class _Tp>
_LIBCPP_INLINE_VISIBILITY constexpr
bool
operator<(nullopt_t, const optional<_Tp>& __x) noexcept
{
return static_cast<bool>(__x);
}
template <class _Tp>
_LIBCPP_INLINE_VISIBILITY constexpr
bool
operator<=(const optional<_Tp>& __x, nullopt_t) noexcept
{
return !static_cast<bool>(__x);
}
template <class _Tp>
_LIBCPP_INLINE_VISIBILITY constexpr
bool
operator<=(nullopt_t, const optional<_Tp>&) noexcept
{
return true;
}
template <class _Tp>
_LIBCPP_INLINE_VISIBILITY constexpr
bool
operator>(const optional<_Tp>& __x, nullopt_t) noexcept
{
return static_cast<bool>(__x);
}
template <class _Tp>
_LIBCPP_INLINE_VISIBILITY constexpr
bool
operator>(nullopt_t, const optional<_Tp>&) noexcept
{
return false;
}
template <class _Tp>
_LIBCPP_INLINE_VISIBILITY constexpr
bool
operator>=(const optional<_Tp>&, nullopt_t) noexcept
{
return true;
}
template <class _Tp>
_LIBCPP_INLINE_VISIBILITY constexpr
bool
operator>=(nullopt_t, const optional<_Tp>& __x) noexcept
{
return !static_cast<bool>(__x);
}
// Comparisons with T
template <class _Tp>
_LIBCPP_INLINE_VISIBILITY constexpr
enable_if_t<
is_convertible_v<decltype(_VSTD::declval<const _Tp&>() ==
_VSTD::declval<const _Tp&>()), bool>,
bool
>
operator==(const optional<_Tp>& __x, const _Tp& __v)
{
return static_cast<bool>(__x) ? *__x == __v : false;
}
template <class _Tp>
_LIBCPP_INLINE_VISIBILITY constexpr
enable_if_t<
is_convertible_v<decltype(_VSTD::declval<const _Tp&>() ==
_VSTD::declval<const _Tp&>()), bool>,
bool
>
operator==(const _Tp& __v, const optional<_Tp>& __x)
{
return static_cast<bool>(__x) ? __v == *__x : false;
}
template <class _Tp>
_LIBCPP_INLINE_VISIBILITY constexpr
enable_if_t<
is_convertible_v<decltype(_VSTD::declval<const _Tp&>() !=
_VSTD::declval<const _Tp&>()), bool>,
bool
>
operator!=(const optional<_Tp>& __x, const _Tp& __v)
{
return static_cast<bool>(__x) ? *__x != __v : true;
}
template <class _Tp>
_LIBCPP_INLINE_VISIBILITY constexpr
enable_if_t<
is_convertible_v<decltype(_VSTD::declval<const _Tp&>() !=
_VSTD::declval<const _Tp&>()), bool>,
bool
>
operator!=(const _Tp& __v, const optional<_Tp>& __x)
{
return static_cast<bool>(__x) ? __v != *__x : true;
}
template <class _Tp>
_LIBCPP_INLINE_VISIBILITY constexpr
enable_if_t<
is_convertible_v<decltype(_VSTD::declval<const _Tp&>() <
_VSTD::declval<const _Tp&>()), bool>,
bool
>
operator<(const optional<_Tp>& __x, const _Tp& __v)
{
return static_cast<bool>(__x) ? *__x < __v : true;
}
template <class _Tp>
_LIBCPP_INLINE_VISIBILITY constexpr
enable_if_t<
is_convertible_v<decltype(_VSTD::declval<const _Tp&>() <
_VSTD::declval<const _Tp&>()), bool>,
bool
>
operator<(const _Tp& __v, const optional<_Tp>& __x)
{
return static_cast<bool>(__x) ? __v < *__x : false;
}
template <class _Tp>
_LIBCPP_INLINE_VISIBILITY constexpr
enable_if_t<
is_convertible_v<decltype(_VSTD::declval<const _Tp&>() <=
_VSTD::declval<const _Tp&>()), bool>,
bool
>
operator<=(const optional<_Tp>& __x, const _Tp& __v)
{
return static_cast<bool>(__x) ? *__x <= __v : true;
}
template <class _Tp>
_LIBCPP_INLINE_VISIBILITY constexpr
enable_if_t<
is_convertible_v<decltype(_VSTD::declval<const _Tp&>() <=
_VSTD::declval<const _Tp&>()), bool>,
bool
>
operator<=(const _Tp& __v, const optional<_Tp>& __x)
{
return static_cast<bool>(__x) ? __v <= *__x : false;
}
template <class _Tp>
_LIBCPP_INLINE_VISIBILITY constexpr
enable_if_t<
is_convertible_v<decltype(_VSTD::declval<const _Tp&>() >
_VSTD::declval<const _Tp&>()), bool>,
bool
>
operator>(const optional<_Tp>& __x, const _Tp& __v)
{
return static_cast<bool>(__x) ? *__x > __v : false;
}
template <class _Tp>
_LIBCPP_INLINE_VISIBILITY constexpr
enable_if_t<
is_convertible_v<decltype(_VSTD::declval<const _Tp&>() >
_VSTD::declval<const _Tp&>()), bool>,
bool
>
operator>(const _Tp& __v, const optional<_Tp>& __x)
{
return static_cast<bool>(__x) ? __v > *__x : true;
}
template <class _Tp>
_LIBCPP_INLINE_VISIBILITY constexpr
enable_if_t<
is_convertible_v<decltype(_VSTD::declval<const _Tp&>() >=
_VSTD::declval<const _Tp&>()), bool>,
bool
>
operator>=(const optional<_Tp>& __x, const _Tp& __v)
{
return static_cast<bool>(__x) ? *__x >= __v : false;
}
template <class _Tp>
_LIBCPP_INLINE_VISIBILITY constexpr
enable_if_t<
is_convertible_v<decltype(_VSTD::declval<const _Tp&>() >=
_VSTD::declval<const _Tp&>()), bool>,
bool
>
operator>=(const _Tp& __v, const optional<_Tp>& __x)
{
return static_cast<bool>(__x) ? __v >= *__x : true;
}
template <class _Tp>
inline _LIBCPP_INLINE_VISIBILITY
enable_if_t<
is_move_constructible_v<_Tp> && is_swappable_v<_Tp>,
void
>
swap(optional<_Tp>& __x, optional<_Tp>& __y) noexcept(noexcept(__x.swap(__y)))
{
__x.swap(__y);
}
template <class _Tp>
_LIBCPP_INLINE_VISIBILITY constexpr
optional<decay_t<_Tp>> make_optional(_Tp&& __v)
{
return optional<decay_t<_Tp>>(_VSTD::forward<_Tp>(__v));
}
template <class _Tp, class... _Args>
_LIBCPP_INLINE_VISIBILITY constexpr
optional<_Tp> make_optional(_Args&&... __args)
{
return optional<_Tp>(in_place, _VSTD::forward<_Args>(__args)...);
}
template <class _Tp, class _Up, class... _Args>
_LIBCPP_INLINE_VISIBILITY constexpr
optional<_Tp> make_optional(initializer_list<_Up> __il, _Args&&... __args)
{
return optional<_Tp>(in_place, __il, _VSTD::forward<_Args>(__args)...);
}
template <class _Tp>
struct _LIBCPP_TEMPLATE_VIS hash<optional<_Tp> >
{
typedef optional<_Tp> argument_type;
typedef size_t result_type;
_LIBCPP_INLINE_VISIBILITY
result_type operator()(const argument_type& __opt) const _NOEXCEPT
{
return static_cast<bool>(__opt) ? hash<_Tp>()(*__opt) : 0;
}
};
_LIBCPP_END_NAMESPACE_STD
#endif // _LIBCPP_STD_VER > 14
#endif // _LIBCPP_OPTIONAL
Index: projects/clang400-import/contrib/libc++/include/variant
===================================================================
--- projects/clang400-import/contrib/libc++/include/variant (revision 313642)
+++ projects/clang400-import/contrib/libc++/include/variant (revision 313643)
@@ -1,1568 +1,1568 @@
// -*- C++ -*-
//===------------------------------ variant -------------------------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is dual licensed under the MIT and the University of Illinois Open
// Source Licenses. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
#ifndef _LIBCPP_VARIANT
#define _LIBCPP_VARIANT
/*
variant synopsis
namespace std {
// 20.7.2, class template variant
template <class... Types>
class variant {
public:
// 20.7.2.1, constructors
constexpr variant() noexcept(see below);
variant(const variant&);
variant(variant&&) noexcept(see below);
template <class T> constexpr variant(T&&) noexcept(see below);
template <class T, class... Args>
constexpr explicit variant(in_place_type_t<T>, Args&&...);
template <class T, class U, class... Args>
constexpr explicit variant(
in_place_type_t<T>, initializer_list<U>, Args&&...);
template <size_t I, class... Args>
constexpr explicit variant(in_place_index_t<I>, Args&&...);
template <size_t I, class U, class... Args>
constexpr explicit variant(
in_place_index_t<I>, initializer_list<U>, Args&&...);
// 20.7.2.2, destructor
~variant();
// 20.7.2.3, assignment
variant& operator=(const variant&);
variant& operator=(variant&&) noexcept(see below);
template <class T> variant& operator=(T&&) noexcept(see below);
// 20.7.2.4, modifiers
template <class T, class... Args>
void emplace(Args&&...);
template <class T, class U, class... Args>
void emplace(initializer_list<U>, Args&&...);
template <size_t I, class... Args>
void emplace(Args&&...);
template <size_t I, class U, class... Args>
void emplace(initializer_list<U>, Args&&...);
// 20.7.2.5, value status
constexpr bool valueless_by_exception() const noexcept;
constexpr size_t index() const noexcept;
// 20.7.2.6, swap
void swap(variant&) noexcept(see below);
};
// 20.7.3, variant helper classes
template <class T> struct variant_size; // undefined
template <class T>
constexpr size_t variant_size_v = variant_size<T>::value;
template <class T> struct variant_size<const T>;
template <class T> struct variant_size<volatile T>;
template <class T> struct variant_size<const volatile T>;
template <class... Types>
struct variant_size<variant<Types...>>;
template <size_t I, class T> struct variant_alternative; // undefined
template <size_t I, class T>
using variant_alternative_t = typename variant_alternative<I, T>::type;
template <size_t I, class T> struct variant_alternative<I, const T>;
template <size_t I, class T> struct variant_alternative<I, volatile T>;
template <size_t I, class T> struct variant_alternative<I, const volatile T>;
template <size_t I, class... Types>
struct variant_alternative<I, variant<Types...>>;
constexpr size_t variant_npos = -1;
// 20.7.4, value access
template <class T, class... Types>
constexpr bool holds_alternative(const variant<Types...>&) noexcept;
template <size_t I, class... Types>
constexpr variant_alternative_t<I, variant<Types...>>&
get(variant<Types...>&);
template <size_t I, class... Types>
constexpr variant_alternative_t<I, variant<Types...>>&&
get(variant<Types...>&&);
template <size_t I, class... Types>
constexpr variant_alternative_t<I, variant<Types...>> const&
get(const variant<Types...>&);
template <size_t I, class... Types>
constexpr variant_alternative_t<I, variant<Types...>> const&&
get(const variant<Types...>&&);
template <class T, class... Types>
constexpr T& get(variant<Types...>&);
template <class T, class... Types>
constexpr T&& get(variant<Types...>&&);
template <class T, class... Types>
constexpr const T& get(const variant<Types...>&);
template <class T, class... Types>
constexpr const T&& get(const variant<Types...>&&);
template <size_t I, class... Types>
constexpr add_pointer_t<variant_alternative_t<I, variant<Types...>>>
get_if(variant<Types...>*) noexcept;
template <size_t I, class... Types>
constexpr add_pointer_t<const variant_alternative_t<I, variant<Types...>>>
get_if(const variant<Types...>*) noexcept;
template <class T, class... Types>
constexpr add_pointer_t<T>
get_if(variant<Types...>*) noexcept;
template <class T, class... Types>
constexpr add_pointer_t<const T>
get_if(const variant<Types...>*) noexcept;
// 20.7.5, relational operators
template <class... Types>
constexpr bool operator==(const variant<Types...>&, const variant<Types...>&);
template <class... Types>
constexpr bool operator!=(const variant<Types...>&, const variant<Types...>&);
template <class... Types>
constexpr bool operator<(const variant<Types...>&, const variant<Types...>&);
template <class... Types>
constexpr bool operator>(const variant<Types...>&, const variant<Types...>&);
template <class... Types>
constexpr bool operator<=(const variant<Types...>&, const variant<Types...>&);
template <class... Types>
constexpr bool operator>=(const variant<Types...>&, const variant<Types...>&);
// 20.7.6, visitation
template <class Visitor, class... Variants>
constexpr see below visit(Visitor&&, Variants&&...);
// 20.7.7, class monostate
struct monostate;
// 20.7.8, monostate relational operators
constexpr bool operator<(monostate, monostate) noexcept;
constexpr bool operator>(monostate, monostate) noexcept;
constexpr bool operator<=(monostate, monostate) noexcept;
constexpr bool operator>=(monostate, monostate) noexcept;
constexpr bool operator==(monostate, monostate) noexcept;
constexpr bool operator!=(monostate, monostate) noexcept;
// 20.7.9, specialized algorithms
template <class... Types>
void swap(variant<Types...>&, variant<Types...>&) noexcept(see below);
// 20.7.10, class bad_variant_access
class bad_variant_access;
// 20.7.11, hash support
template <class T> struct hash;
template <class... Types> struct hash<variant<Types...>>;
template <> struct hash<monostate>;
} // namespace std
*/
#include <__config>
#include <__tuple>
#include <array>
#include <exception>
#include <functional>
#include <initializer_list>
#include <new>
#include <tuple>
#include <type_traits>
#include <utility>
#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
#pragma GCC system_header
#endif
namespace std { // explicitly not using versioning namespace
class _LIBCPP_EXCEPTION_ABI bad_variant_access : public exception {
public:
virtual const char* what() const _NOEXCEPT;
};
} // namespace std
_LIBCPP_BEGIN_NAMESPACE_STD
#if _LIBCPP_STD_VER > 14
_LIBCPP_NORETURN
inline _LIBCPP_INLINE_VISIBILITY
void __throw_bad_variant_access() {
#ifndef _LIBCPP_NO_EXCEPTIONS
throw bad_variant_access();
#else
_VSTD::abort();
#endif
}
template <class... _Types>
class _LIBCPP_TEMPLATE_VIS variant;
template <class _Tp>
struct _LIBCPP_TEMPLATE_VIS variant_size;
template <class _Tp>
constexpr size_t variant_size_v = variant_size<_Tp>::value;
template <class _Tp>
struct _LIBCPP_TEMPLATE_VIS variant_size<const _Tp> : variant_size<_Tp> {};
template <class _Tp>
struct _LIBCPP_TEMPLATE_VIS variant_size<volatile _Tp> : variant_size<_Tp> {};
template <class _Tp>
struct _LIBCPP_TEMPLATE_VIS variant_size<const volatile _Tp>
: variant_size<_Tp> {};
template <class... _Types>
struct _LIBCPP_TEMPLATE_VIS variant_size<variant<_Types...>>
: integral_constant<size_t, sizeof...(_Types)> {};
template <size_t _Ip, class _Tp>
struct _LIBCPP_TEMPLATE_VIS variant_alternative;
template <size_t _Ip, class _Tp>
using variant_alternative_t = typename variant_alternative<_Ip, _Tp>::type;
template <size_t _Ip, class _Tp>
struct _LIBCPP_TEMPLATE_VIS variant_alternative<_Ip, const _Tp>
: add_const<variant_alternative_t<_Ip, _Tp>> {};
template <size_t _Ip, class _Tp>
struct _LIBCPP_TEMPLATE_VIS variant_alternative<_Ip, volatile _Tp>
: add_volatile<variant_alternative_t<_Ip, _Tp>> {};
template <size_t _Ip, class _Tp>
struct _LIBCPP_TEMPLATE_VIS variant_alternative<_Ip, const volatile _Tp>
: add_cv<variant_alternative_t<_Ip, _Tp>> {};
template <size_t _Ip, class... _Types>
struct _LIBCPP_TEMPLATE_VIS variant_alternative<_Ip, variant<_Types...>> {
static_assert(_Ip < sizeof...(_Types));
using type = __type_pack_element<_Ip, _Types...>;
};
constexpr size_t variant_npos = static_cast<size_t>(-1);
constexpr unsigned int __variant_npos = static_cast<unsigned int>(-1);
namespace __find_detail {
template <class _Tp, class... _Types>
inline _LIBCPP_INLINE_VISIBILITY
constexpr size_t __find_index() {
constexpr bool __matches[] = {is_same_v<_Tp, _Types>...};
size_t __result = __not_found;
for (size_t __i = 0; __i < sizeof...(_Types); ++__i) {
if (__matches[__i]) {
if (__result != __not_found) {
return __ambiguous;
}
__result = __i;
}
}
return __result;
}
template <size_t _Index>
struct __find_unambiguous_index_sfinae_impl
: integral_constant<size_t, _Index> {};
template <>
struct __find_unambiguous_index_sfinae_impl<__not_found> {};
template <>
struct __find_unambiguous_index_sfinae_impl<__ambiguous> {};
template <class _Tp, class... _Types>
struct __find_unambiguous_index_sfinae
: __find_unambiguous_index_sfinae_impl<__find_index<_Tp, _Types...>()> {};
} // namespace __find_detail
namespace __variant_detail {
struct __valueless_t {};
enum class _Trait { _TriviallyAvailable, _Available, _Unavailable };
template <typename _Tp,
template <typename> class _IsTriviallyAvailable,
template <typename> class _IsAvailable>
constexpr _Trait __trait =
_IsTriviallyAvailable<_Tp>::value
? _Trait::_TriviallyAvailable
: _IsAvailable<_Tp>::value ? _Trait::_Available : _Trait::_Unavailable;
inline _LIBCPP_INLINE_VISIBILITY
constexpr _Trait __common_trait(initializer_list<_Trait> __traits) {
_Trait __result = _Trait::_TriviallyAvailable;
for (_Trait __t : __traits) {
if (static_cast<int>(__t) > static_cast<int>(__result)) {
__result = __t;
}
}
return __result;
}
template <typename... _Types>
struct __traits {
static constexpr _Trait __copy_constructible_trait =
__common_trait({__trait<_Types,
is_trivially_copy_constructible,
is_copy_constructible>...});
static constexpr _Trait __move_constructible_trait =
__common_trait({__trait<_Types,
is_trivially_move_constructible,
is_move_constructible>...});
static constexpr _Trait __copy_assignable_trait = __common_trait(
{__copy_constructible_trait,
__move_constructible_trait,
__trait<_Types, is_trivially_copy_assignable, is_copy_assignable>...});
static constexpr _Trait __move_assignable_trait = __common_trait(
{__move_constructible_trait,
__trait<_Types, is_trivially_move_assignable, is_move_assignable>...});
static constexpr _Trait __destructible_trait = __common_trait(
{__trait<_Types, is_trivially_destructible, is_destructible>...});
};
namespace __access {
struct __union {
template <class _Vp>
inline _LIBCPP_INLINE_VISIBILITY
static constexpr auto&& __get_alt(_Vp&& __v, in_place_index_t<0>) {
return _VSTD::forward<_Vp>(__v).__head;
}
template <class _Vp, size_t _Ip>
inline _LIBCPP_INLINE_VISIBILITY
static constexpr auto&& __get_alt(_Vp&& __v, in_place_index_t<_Ip>) {
return __get_alt(_VSTD::forward<_Vp>(__v).__tail, in_place_index<_Ip - 1>);
}
};
struct __base {
template <size_t _Ip, class _Vp>
inline _LIBCPP_INLINE_VISIBILITY
static constexpr auto&& __get_alt(_Vp&& __v) {
return __union::__get_alt(_VSTD::forward<_Vp>(__v).__data,
in_place_index<_Ip>);
}
};
struct __variant {
template <size_t _Ip, class _Vp>
inline _LIBCPP_INLINE_VISIBILITY
static constexpr auto&& __get_alt(_Vp&& __v) {
return __base::__get_alt<_Ip>(_VSTD::forward<_Vp>(__v).__impl);
}
};
} // namespace __access
namespace __visitation {
struct __base {
template <class _Visitor, class... _Vs>
inline _LIBCPP_INLINE_VISIBILITY
static constexpr decltype(auto)
__visit_alt_at(size_t __index, _Visitor&& __visitor, _Vs&&... __vs) {
constexpr auto __fdiagonal =
__make_fdiagonal<_Visitor&&,
decltype(_VSTD::forward<_Vs>(__vs).__as_base())...>();
return __fdiagonal[__index](_VSTD::forward<_Visitor>(__visitor),
_VSTD::forward<_Vs>(__vs).__as_base()...);
}
template <class _Visitor, class... _Vs>
inline _LIBCPP_INLINE_VISIBILITY
static constexpr decltype(auto) __visit_alt(_Visitor&& __visitor,
_Vs&&... __vs) {
constexpr auto __fmatrix =
__make_fmatrix<_Visitor&&,
decltype(_VSTD::forward<_Vs>(__vs).__as_base())...>();
const size_t __indices[] = {__vs.index()...};
return __at(__fmatrix, __indices)(_VSTD::forward<_Visitor>(__visitor),
_VSTD::forward<_Vs>(__vs).__as_base()...);
}
private:
template <class _Tp>
inline _LIBCPP_INLINE_VISIBILITY
static constexpr const _Tp& __at_impl(const _Tp& __elem, const size_t*) {
return __elem;
}
template <class _Tp, size_t _Np>
inline _LIBCPP_INLINE_VISIBILITY
static constexpr auto&& __at_impl(const array<_Tp, _Np>& __elems,
const size_t* __index) {
return __at_impl(__elems[*__index], __index + 1);
}
template <class _Tp, size_t _Np, size_t _Ip>
inline _LIBCPP_INLINE_VISIBILITY
static constexpr auto&& __at(const array<_Tp, _Np>& __elems,
const size_t (&__indices)[_Ip]) {
return __at_impl(__elems, begin(__indices));
}
template <class _Fp, class... _Fs>
static constexpr void __std_visit_visitor_return_type_check() {
static_assert(
__all<is_same_v<_Fp, _Fs>...>::value,
"`std::visit` requires the visitor to have a single return type.");
}
template <class... _Fs>
inline _LIBCPP_INLINE_VISIBILITY
static constexpr auto __make_farray(_Fs&&... __fs) {
__std_visit_visitor_return_type_check<decay_t<_Fs>...>();
using __result = array<common_type_t<decay_t<_Fs>...>, sizeof...(_Fs)>;
return __result{{_VSTD::forward<_Fs>(__fs)...}};
}
template <class _Fp, class... _Vs, size_t... _Is>
inline _LIBCPP_INLINE_VISIBILITY
static constexpr auto __make_dispatch(index_sequence<_Is...>) {
struct __dispatcher {
static constexpr decltype(auto) __dispatch(_Fp __f, _Vs... __vs) {
return __invoke_constexpr(
static_cast<_Fp>(__f),
__access::__base::__get_alt<_Is>(static_cast<_Vs>(__vs))...);
}
};
return _VSTD::addressof(__dispatcher::__dispatch);
}
template <size_t _Ip, class _Fp, class... _Vs>
inline _LIBCPP_INLINE_VISIBILITY
static constexpr auto __make_fdiagonal_impl() {
return __make_dispatch<_Fp, _Vs...>(
index_sequence<(__identity<_Vs>{}, _Ip)...>{});
}
template <class _Fp, class... _Vs, size_t... _Is>
inline _LIBCPP_INLINE_VISIBILITY
static constexpr auto __make_fdiagonal_impl(index_sequence<_Is...>) {
return __base::__make_farray(__make_fdiagonal_impl<_Is, _Fp, _Vs...>()...);
}
template <class _Fp, class _Vp, class... _Vs>
inline _LIBCPP_INLINE_VISIBILITY
static constexpr auto __make_fdiagonal() {
constexpr size_t _Np = decay_t<_Vp>::__size();
static_assert(__all<(_Np == decay_t<_Vs>::__size())...>::value);
return __make_fdiagonal_impl<_Fp, _Vp, _Vs...>(make_index_sequence<_Np>{});
}
template <class _Fp, class... _Vs, size_t... _Is>
inline _LIBCPP_INLINE_VISIBILITY
static constexpr auto __make_fmatrix_impl(index_sequence<_Is...> __is) {
return __make_dispatch<_Fp, _Vs...>(__is);
}
template <class _Fp, class... _Vs, size_t... _Is, size_t... _Js, class... _Ls>
inline _LIBCPP_INLINE_VISIBILITY
static constexpr auto __make_fmatrix_impl(index_sequence<_Is...>,
index_sequence<_Js...>,
_Ls... __ls) {
return __base::__make_farray(__make_fmatrix_impl<_Fp, _Vs...>(
index_sequence<_Is..., _Js>{}, __ls...)...);
}
template <class _Fp, class... _Vs>
inline _LIBCPP_INLINE_VISIBILITY
static constexpr auto __make_fmatrix() {
return __make_fmatrix_impl<_Fp, _Vs...>(
index_sequence<>{}, make_index_sequence<decay_t<_Vs>::__size()>{}...);
}
};
struct __variant {
template <class _Visitor, class... _Vs>
inline _LIBCPP_INLINE_VISIBILITY
static constexpr decltype(auto)
__visit_alt_at(size_t __index, _Visitor&& __visitor, _Vs&&... __vs) {
return __base::__visit_alt_at(__index,
_VSTD::forward<_Visitor>(__visitor),
_VSTD::forward<_Vs>(__vs).__impl...);
}
template <class _Visitor, class... _Vs>
inline _LIBCPP_INLINE_VISIBILITY
static constexpr decltype(auto) __visit_alt(_Visitor&& __visitor,
_Vs&&... __vs) {
return __base::__visit_alt(_VSTD::forward<_Visitor>(__visitor),
_VSTD::forward<_Vs>(__vs).__impl...);
}
template <class _Visitor, class... _Vs>
inline _LIBCPP_INLINE_VISIBILITY
static constexpr decltype(auto)
__visit_value_at(size_t __index, _Visitor&& __visitor, _Vs&&... __vs) {
return __visit_alt_at(
__index,
__make_value_visitor(_VSTD::forward<_Visitor>(__visitor)),
_VSTD::forward<_Vs>(__vs)...);
}
template <class _Visitor, class... _Vs>
inline _LIBCPP_INLINE_VISIBILITY
static constexpr decltype(auto) __visit_value(_Visitor&& __visitor,
_Vs&&... __vs) {
return __visit_alt(
__make_value_visitor(_VSTD::forward<_Visitor>(__visitor)),
_VSTD::forward<_Vs>(__vs)...);
}
private:
template <class _Visitor, class... _Values>
static constexpr void __std_visit_exhaustive_visitor_check() {
static_assert(is_callable_v<_Visitor(_Values...)>,
"`std::visit` requires the visitor to be exhaustive.");
}
template <class _Visitor>
struct __value_visitor {
template <class... _Alts>
inline _LIBCPP_INLINE_VISIBILITY
constexpr decltype(auto) operator()(_Alts&&... __alts) const {
__std_visit_exhaustive_visitor_check<
_Visitor,
- decltype(_VSTD::forward<_Alts>(__alts).__value)...>();
+ decltype((_VSTD::forward<_Alts>(__alts).__value))...>();
return __invoke_constexpr(_VSTD::forward<_Visitor>(__visitor),
_VSTD::forward<_Alts>(__alts).__value...);
}
_Visitor&& __visitor;
};
template <class _Visitor>
inline _LIBCPP_INLINE_VISIBILITY
static constexpr auto __make_value_visitor(_Visitor&& __visitor) {
return __value_visitor<_Visitor>{_VSTD::forward<_Visitor>(__visitor)};
}
};
} // namespace __visitation
template <size_t _Index, class _Tp>
struct _LIBCPP_TEMPLATE_VIS __alt {
using __value_type = _Tp;
template <class... _Args>
inline _LIBCPP_INLINE_VISIBILITY
explicit constexpr __alt(in_place_t, _Args&&... __args)
: __value(_VSTD::forward<_Args>(__args)...) {}
__value_type __value;
};
template <_Trait _DestructibleTrait, size_t _Index, class... _Types>
union _LIBCPP_TEMPLATE_VIS __union;
template <_Trait _DestructibleTrait, size_t _Index>
union _LIBCPP_TEMPLATE_VIS __union<_DestructibleTrait, _Index> {};
#define _LIBCPP_VARIANT_UNION(destructible_trait, destructor) \
template <size_t _Index, class _Tp, class... _Types> \
union _LIBCPP_TEMPLATE_VIS __union<destructible_trait, \
_Index, \
_Tp, \
_Types...> { \
public: \
inline _LIBCPP_INLINE_VISIBILITY \
explicit constexpr __union(__valueless_t) noexcept : __dummy{} {} \
\
template <class... _Args> \
inline _LIBCPP_INLINE_VISIBILITY \
explicit constexpr __union(in_place_index_t<0>, _Args&&... __args) \
: __head(in_place, _VSTD::forward<_Args>(__args)...) {} \
\
template <size_t _Ip, class... _Args> \
inline _LIBCPP_INLINE_VISIBILITY \
explicit constexpr __union(in_place_index_t<_Ip>, _Args&&... __args) \
: __tail(in_place_index<_Ip - 1>, _VSTD::forward<_Args>(__args)...) {} \
\
__union(const __union&) = default; \
__union(__union&&) = default; \
\
destructor \
\
__union& operator=(const __union&) = default; \
__union& operator=(__union&&) = default; \
\
private: \
char __dummy; \
__alt<_Index, _Tp> __head; \
__union<destructible_trait, _Index + 1, _Types...> __tail; \
\
friend struct __access::__union; \
}
_LIBCPP_VARIANT_UNION(_Trait::_TriviallyAvailable, ~__union() = default;);
_LIBCPP_VARIANT_UNION(_Trait::_Available, ~__union() {});
_LIBCPP_VARIANT_UNION(_Trait::_Unavailable, ~__union() = delete;);
#undef _LIBCPP_VARIANT_UNION
template <_Trait _DestructibleTrait, class... _Types>
class _LIBCPP_TEMPLATE_VIS __base {
public:
inline _LIBCPP_INLINE_VISIBILITY
explicit constexpr __base(__valueless_t tag) noexcept
: __data(tag), __index(__variant_npos) {}
template <size_t _Ip, class... _Args>
inline _LIBCPP_INLINE_VISIBILITY
explicit constexpr __base(in_place_index_t<_Ip>, _Args&&... __args)
:
__data(in_place_index<_Ip>, _VSTD::forward<_Args>(__args)...),
__index(_Ip) {}
inline _LIBCPP_INLINE_VISIBILITY
constexpr bool valueless_by_exception() const noexcept {
return index() == variant_npos;
}
inline _LIBCPP_INLINE_VISIBILITY
constexpr size_t index() const noexcept {
return __index == __variant_npos ? variant_npos : __index;
}
protected:
inline _LIBCPP_INLINE_VISIBILITY
constexpr auto&& __as_base() & { return *this; }
inline _LIBCPP_INLINE_VISIBILITY
constexpr auto&& __as_base() && { return _VSTD::move(*this); }
inline _LIBCPP_INLINE_VISIBILITY
constexpr auto&& __as_base() const & { return *this; }
inline _LIBCPP_INLINE_VISIBILITY
constexpr auto&& __as_base() const && { return _VSTD::move(*this); }
inline _LIBCPP_INLINE_VISIBILITY
static constexpr size_t __size() { return sizeof...(_Types); }
__union<_DestructibleTrait, 0, _Types...> __data;
unsigned int __index;
friend struct __access::__base;
friend struct __visitation::__base;
};
template <class _Traits, _Trait = _Traits::__destructible_trait>
class _LIBCPP_TEMPLATE_VIS __destructor;
#define _LIBCPP_VARIANT_DESTRUCTOR(destructible_trait, destructor, destroy) \
template <class... _Types> \
class _LIBCPP_TEMPLATE_VIS __destructor<__traits<_Types...>, \
destructible_trait> \
: public __base<destructible_trait, _Types...> { \
using __base_type = __base<destructible_trait, _Types...>; \
\
public: \
using __base_type::__base_type; \
using __base_type::operator=; \
\
__destructor(const __destructor&) = default; \
__destructor(__destructor&&) = default; \
destructor \
__destructor& operator=(const __destructor&) = default; \
__destructor& operator=(__destructor&&) = default; \
\
protected: \
inline _LIBCPP_INLINE_VISIBILITY \
destroy \
}
_LIBCPP_VARIANT_DESTRUCTOR(
_Trait::_TriviallyAvailable,
~__destructor() = default;,
void __destroy() noexcept { this->__index = __variant_npos; });
_LIBCPP_VARIANT_DESTRUCTOR(
_Trait::_Available,
~__destructor() { __destroy(); },
void __destroy() noexcept {
if (!this->valueless_by_exception()) {
__visitation::__base::__visit_alt(
[](auto& __alt) noexcept {
using __alt_type = decay_t<decltype(__alt)>;
__alt.~__alt_type();
},
*this);
}
this->__index = __variant_npos;
});
_LIBCPP_VARIANT_DESTRUCTOR(
_Trait::_Unavailable,
~__destructor() = delete;,
void __destroy() noexcept = delete;);
#undef _LIBCPP_VARIANT_DESTRUCTOR
template <class _Traits>
class _LIBCPP_TEMPLATE_VIS __constructor : public __destructor<_Traits> {
using __base_type = __destructor<_Traits>;
public:
using __base_type::__base_type;
using __base_type::operator=;
protected:
template <size_t _Ip, class _Tp, class... _Args>
inline _LIBCPP_INLINE_VISIBILITY
static void __construct_alt(__alt<_Ip, _Tp>& __a, _Args&&... __args) {
::new (_VSTD::addressof(__a))
__alt<_Ip, _Tp>(in_place, _VSTD::forward<_Args>(__args)...);
}
template <class _Rhs>
inline _LIBCPP_INLINE_VISIBILITY
static void __generic_construct(__constructor& __lhs, _Rhs&& __rhs) {
__lhs.__destroy();
if (!__rhs.valueless_by_exception()) {
__visitation::__base::__visit_alt_at(
__rhs.index(),
[](auto& __lhs_alt, auto&& __rhs_alt) {
__construct_alt(
__lhs_alt,
_VSTD::forward<decltype(__rhs_alt)>(__rhs_alt).__value);
},
__lhs, _VSTD::forward<_Rhs>(__rhs));
__lhs.__index = __rhs.index();
}
}
};
template <class _Traits, _Trait = _Traits::__move_constructible_trait>
class _LIBCPP_TEMPLATE_VIS __move_constructor;
#define _LIBCPP_VARIANT_MOVE_CONSTRUCTOR(move_constructible_trait, \
move_constructor) \
template <class... _Types> \
class _LIBCPP_TEMPLATE_VIS __move_constructor<__traits<_Types...>, \
move_constructible_trait> \
: public __constructor<__traits<_Types...>> { \
using __base_type = __constructor<__traits<_Types...>>; \
\
public: \
using __base_type::__base_type; \
using __base_type::operator=; \
\
__move_constructor(const __move_constructor&) = default; \
move_constructor \
~__move_constructor() = default; \
__move_constructor& operator=(const __move_constructor&) = default; \
__move_constructor& operator=(__move_constructor&&) = default; \
}
_LIBCPP_VARIANT_MOVE_CONSTRUCTOR(
_Trait::_TriviallyAvailable,
__move_constructor(__move_constructor&& __that) = default;);
_LIBCPP_VARIANT_MOVE_CONSTRUCTOR(
_Trait::_Available,
__move_constructor(__move_constructor&& __that) noexcept(
__all<is_nothrow_move_constructible_v<_Types>...>::value)
: __move_constructor(__valueless_t{}) {
this->__generic_construct(*this, _VSTD::move(__that));
});
_LIBCPP_VARIANT_MOVE_CONSTRUCTOR(
_Trait::_Unavailable,
__move_constructor(__move_constructor&&) = delete;);
#undef _LIBCPP_VARIANT_MOVE_CONSTRUCTOR
template <class _Traits, _Trait = _Traits::__copy_constructible_trait>
class _LIBCPP_TEMPLATE_VIS __copy_constructor;
#define _LIBCPP_VARIANT_COPY_CONSTRUCTOR(copy_constructible_trait, \
copy_constructor) \
template <class... _Types> \
class _LIBCPP_TEMPLATE_VIS __copy_constructor<__traits<_Types...>, \
copy_constructible_trait> \
: public __move_constructor<__traits<_Types...>> { \
using __base_type = __move_constructor<__traits<_Types...>>; \
\
public: \
using __base_type::__base_type; \
using __base_type::operator=; \
\
copy_constructor \
__copy_constructor(__copy_constructor&&) = default; \
~__copy_constructor() = default; \
__copy_constructor& operator=(const __copy_constructor&) = default; \
__copy_constructor& operator=(__copy_constructor&&) = default; \
}
_LIBCPP_VARIANT_COPY_CONSTRUCTOR(
_Trait::_TriviallyAvailable,
__copy_constructor(const __copy_constructor& __that) = default;);
_LIBCPP_VARIANT_COPY_CONSTRUCTOR(
_Trait::_Available,
__copy_constructor(const __copy_constructor& __that)
: __copy_constructor(__valueless_t{}) {
this->__generic_construct(*this, __that);
});
_LIBCPP_VARIANT_COPY_CONSTRUCTOR(
_Trait::_Unavailable,
__copy_constructor(const __copy_constructor&) = delete;);
#undef _LIBCPP_VARIANT_COPY_CONSTRUCTOR
template <class _Traits>
class _LIBCPP_TEMPLATE_VIS __assignment : public __copy_constructor<_Traits> {
using __base_type = __copy_constructor<_Traits>;
public:
using __base_type::__base_type;
using __base_type::operator=;
template <size_t _Ip, class... _Args>
inline _LIBCPP_INLINE_VISIBILITY
void __emplace(_Args&&... __args) {
this->__destroy();
this->__construct_alt(__access::__base::__get_alt<_Ip>(*this),
_VSTD::forward<_Args>(__args)...);
this->__index = _Ip;
}
protected:
template <bool _CopyAssign, size_t _Ip, class _Tp, class _Arg>
inline _LIBCPP_INLINE_VISIBILITY
void __assign_alt(__alt<_Ip, _Tp>& __a,
_Arg&& __arg,
bool_constant<_CopyAssign> __tag) {
if (this->index() == _Ip) {
__a.__value = _VSTD::forward<_Arg>(__arg);
} else {
struct {
void operator()(true_type) const {
__this->__emplace<_Ip>(_Tp(_VSTD::forward<_Arg>(__arg)));
}
void operator()(false_type) const {
__this->__emplace<_Ip>(_VSTD::forward<_Arg>(__arg));
}
__assignment* __this;
_Arg&& __arg;
} __impl{this, _VSTD::forward<_Arg>(__arg)};
__impl(__tag);
}
}
template <class _That>
inline _LIBCPP_INLINE_VISIBILITY
void __generic_assign(_That&& __that) {
if (this->valueless_by_exception() && __that.valueless_by_exception()) {
// do nothing.
} else if (__that.valueless_by_exception()) {
this->__destroy();
} else {
__visitation::__base::__visit_alt_at(
__that.index(),
[this](auto& __this_alt, auto&& __that_alt) {
this->__assign_alt(
__this_alt,
_VSTD::forward<decltype(__that_alt)>(__that_alt).__value,
is_lvalue_reference<_That>{});
},
*this, _VSTD::forward<_That>(__that));
}
}
};
template <class _Traits, _Trait = _Traits::__move_assignable_trait>
class _LIBCPP_TEMPLATE_VIS __move_assignment;
#define _LIBCPP_VARIANT_MOVE_ASSIGNMENT(move_assignable_trait, \
move_assignment) \
template <class... _Types> \
class _LIBCPP_TEMPLATE_VIS __move_assignment<__traits<_Types...>, \
move_assignable_trait> \
: public __assignment<__traits<_Types...>> { \
using __base_type = __assignment<__traits<_Types...>>; \
\
public: \
using __base_type::__base_type; \
using __base_type::operator=; \
\
__move_assignment(const __move_assignment&) = default; \
__move_assignment(__move_assignment&&) = default; \
~__move_assignment() = default; \
__move_assignment& operator=(const __move_assignment&) = default; \
move_assignment \
}
_LIBCPP_VARIANT_MOVE_ASSIGNMENT(
_Trait::_TriviallyAvailable,
__move_assignment& operator=(__move_assignment&& __that) = default;);
_LIBCPP_VARIANT_MOVE_ASSIGNMENT(
_Trait::_Available,
__move_assignment& operator=(__move_assignment&& __that) noexcept(
__all<(is_nothrow_move_constructible_v<_Types> &&
is_nothrow_move_assignable_v<_Types>)...>::value) {
this->__generic_assign(_VSTD::move(__that));
return *this;
});
_LIBCPP_VARIANT_MOVE_ASSIGNMENT(
_Trait::_Unavailable,
__move_assignment& operator=(__move_assignment&&) = delete;);
#undef _LIBCPP_VARIANT_MOVE_ASSIGNMENT
template <class _Traits, _Trait = _Traits::__copy_assignable_trait>
class _LIBCPP_TEMPLATE_VIS __copy_assignment;
#define _LIBCPP_VARIANT_COPY_ASSIGNMENT(copy_assignable_trait, \
copy_assignment) \
template <class... _Types> \
class _LIBCPP_TEMPLATE_VIS __copy_assignment<__traits<_Types...>, \
copy_assignable_trait> \
: public __move_assignment<__traits<_Types...>> { \
using __base_type = __move_assignment<__traits<_Types...>>; \
\
public: \
using __base_type::__base_type; \
using __base_type::operator=; \
\
__copy_assignment(const __copy_assignment&) = default; \
__copy_assignment(__copy_assignment&&) = default; \
~__copy_assignment() = default; \
copy_assignment \
__copy_assignment& operator=(__copy_assignment&&) = default; \
}
_LIBCPP_VARIANT_COPY_ASSIGNMENT(
_Trait::_TriviallyAvailable,
__copy_assignment& operator=(const __copy_assignment& __that) = default;);
_LIBCPP_VARIANT_COPY_ASSIGNMENT(
_Trait::_Available,
__copy_assignment& operator=(const __copy_assignment& __that) {
this->__generic_assign(__that);
return *this;
});
_LIBCPP_VARIANT_COPY_ASSIGNMENT(
_Trait::_Unavailable,
__copy_assignment& operator=(const __copy_assignment&) = delete;);
#undef _LIBCPP_VARIANT_COPY_ASSIGNMENT
template <class... _Types>
class _LIBCPP_TEMPLATE_VIS __impl
: public __copy_assignment<__traits<_Types...>> {
using __base_type = __copy_assignment<__traits<_Types...>>;
public:
using __base_type::__base_type;
using __base_type::operator=;
template <size_t _Ip, class _Arg>
inline _LIBCPP_INLINE_VISIBILITY
void __assign(_Arg&& __arg) {
this->__assign_alt(__access::__base::__get_alt<_Ip>(*this),
_VSTD::forward<_Arg>(__arg),
false_type{});
}
inline _LIBCPP_INLINE_VISIBILITY
void __swap(__impl& __that) {
if (this->valueless_by_exception() && __that.valueless_by_exception()) {
// do nothing.
} else if (this->index() == __that.index()) {
__visitation::__base::__visit_alt_at(
this->index(),
[](auto& __this_alt, auto& __that_alt) {
using _VSTD::swap;
swap(__this_alt.__value, __that_alt.__value);
},
*this,
__that);
} else {
__impl* __lhs = this;
__impl* __rhs = _VSTD::addressof(__that);
if (__lhs->__move_nothrow() && !__rhs->__move_nothrow()) {
_VSTD::swap(__lhs, __rhs);
}
__impl __tmp(_VSTD::move(*__rhs));
#ifndef _LIBCPP_NO_EXCEPTIONS
// EXTENSION: When the move construction of `__lhs` into `__rhs` throws
// and `__tmp` is nothrow move constructible then we move `__tmp` back
// into `__rhs` and provide the strong exception safety guarentee.
try {
this->__generic_construct(*__rhs, _VSTD::move(*__lhs));
} catch (...) {
if (__tmp.__move_nothrow()) {
this->__generic_construct(*__rhs, _VSTD::move(__tmp));
}
throw;
}
#else
this->__generic_construct(*__rhs, _VSTD::move(*__lhs));
#endif
this->__generic_construct(*__lhs, _VSTD::move(__tmp));
}
}
private:
inline _LIBCPP_INLINE_VISIBILITY
bool __move_nothrow() const {
constexpr bool __results[] = {is_nothrow_move_constructible_v<_Types>...};
return this->valueless_by_exception() || __results[this->index()];
}
};
template <class... _Types>
struct __overload;
template <>
struct __overload<> { void operator()() const; };
template <class _Tp, class... _Types>
struct __overload<_Tp, _Types...> : __overload<_Types...> {
using __overload<_Types...>::operator();
__identity<_Tp> operator()(_Tp) const;
};
template <class _Tp, class... _Types>
using __best_match_t = typename result_of_t<__overload<_Types...>(_Tp&&)>::type;
} // __variant_detail
template <class... _Types>
class _LIBCPP_TEMPLATE_VIS variant
: private __sfinae_ctor_base<
__all<is_copy_constructible_v<_Types>...>::value,
__all<is_move_constructible_v<_Types>...>::value>,
private __sfinae_assign_base<
__all<(is_copy_constructible_v<_Types> &&
is_move_constructible_v<_Types> &&
is_copy_assignable_v<_Types>)...>::value,
__all<(is_move_constructible_v<_Types> &&
is_move_assignable_v<_Types>)...>::value> {
static_assert(0 < sizeof...(_Types),
"variant must consist of at least one alternative.");
static_assert(__all<!is_array_v<_Types>...>::value,
"variant can not have an array type as an alternative.");
static_assert(__all<!is_reference_v<_Types>...>::value,
"variant can not have a reference type as an alternative.");
static_assert(__all<!is_void_v<_Types>...>::value,
"variant can not have a void type as an alternative.");
using __first_type = variant_alternative_t<0, variant>;
public:
template <bool _Dummy = true,
enable_if_t<__dependent_type<is_default_constructible<__first_type>,
_Dummy>::value,
int> = 0>
inline _LIBCPP_INLINE_VISIBILITY
constexpr variant() noexcept(is_nothrow_default_constructible_v<__first_type>)
: __impl(in_place_index<0>) {}
variant(const variant&) = default;
variant(variant&&) = default;
template <
class _Arg,
enable_if_t<!is_same_v<decay_t<_Arg>, variant>, int> = 0,
class _Tp = __variant_detail::__best_match_t<_Arg, _Types...>,
size_t _Ip =
__find_detail::__find_unambiguous_index_sfinae<_Tp, _Types...>::value,
enable_if_t<is_constructible_v<_Tp, _Arg>, int> = 0>
inline _LIBCPP_INLINE_VISIBILITY
constexpr variant(_Arg&& __arg) noexcept(
is_nothrow_constructible_v<_Tp, _Arg>)
: __impl(in_place_index<_Ip>, _VSTD::forward<_Arg>(__arg)) {}
template <size_t _Ip, class... _Args,
enable_if_t<(_Ip < sizeof...(_Types)), int> = 0,
class _Tp = variant_alternative_t<_Ip, variant<_Types...>>,
enable_if_t<is_constructible_v<_Tp, _Args...>, int> = 0>
inline _LIBCPP_INLINE_VISIBILITY
explicit constexpr variant(
in_place_index_t<_Ip>,
_Args&&... __args) noexcept(is_nothrow_constructible_v<_Tp, _Args...>)
: __impl(in_place_index<_Ip>, _VSTD::forward<_Args>(__args)...) {}
template <
size_t _Ip,
class _Up,
class... _Args,
enable_if_t<(_Ip < sizeof...(_Types)), int> = 0,
class _Tp = variant_alternative_t<_Ip, variant<_Types...>>,
enable_if_t<is_constructible_v<_Tp, initializer_list<_Up>&, _Args...>,
int> = 0>
inline _LIBCPP_INLINE_VISIBILITY
explicit constexpr variant(
in_place_index_t<_Ip>,
initializer_list<_Up> __il,
_Args&&... __args) noexcept(
is_nothrow_constructible_v<_Tp, initializer_list<_Up>&, _Args...>)
: __impl(in_place_index<_Ip>, __il, _VSTD::forward<_Args>(__args)...) {}
template <
class _Tp,
class... _Args,
size_t _Ip =
__find_detail::__find_unambiguous_index_sfinae<_Tp, _Types...>::value,
enable_if_t<is_constructible_v<_Tp, _Args...>, int> = 0>
inline _LIBCPP_INLINE_VISIBILITY
explicit constexpr variant(in_place_type_t<_Tp>, _Args&&... __args) noexcept(
is_nothrow_constructible_v<_Tp, _Args...>)
: __impl(in_place_index<_Ip>, _VSTD::forward<_Args>(__args)...) {}
template <
class _Tp,
class _Up,
class... _Args,
size_t _Ip =
__find_detail::__find_unambiguous_index_sfinae<_Tp, _Types...>::value,
enable_if_t<is_constructible_v<_Tp, initializer_list<_Up>&, _Args...>,
int> = 0>
inline _LIBCPP_INLINE_VISIBILITY
explicit constexpr variant(
in_place_type_t<_Tp>,
initializer_list<_Up> __il,
_Args&&... __args) noexcept(
is_nothrow_constructible_v<_Tp, initializer_list< _Up>&, _Args...>)
: __impl(in_place_index<_Ip>, __il, _VSTD::forward<_Args>(__args)...) {}
~variant() = default;
variant& operator=(const variant&) = default;
variant& operator=(variant&&) = default;
template <
class _Arg,
enable_if_t<!is_same_v<decay_t<_Arg>, variant>, int> = 0,
class _Tp = __variant_detail::__best_match_t<_Arg, _Types...>,
size_t _Ip =
__find_detail::__find_unambiguous_index_sfinae<_Tp, _Types...>::value,
enable_if_t<is_assignable_v<_Tp&, _Arg> && is_constructible_v<_Tp, _Arg>,
int> = 0>
inline _LIBCPP_INLINE_VISIBILITY
variant& operator=(_Arg&& __arg) noexcept(
is_nothrow_assignable_v<_Tp&, _Arg> &&
is_nothrow_constructible_v<_Tp, _Arg>) {
__impl.template __assign<_Ip>(_VSTD::forward<_Arg>(__arg));
return *this;
}
template <
size_t _Ip,
class... _Args,
enable_if_t<(_Ip < sizeof...(_Types)), int> = 0,
class _Tp = variant_alternative_t<_Ip, variant<_Types...>>,
enable_if_t<is_constructible_v<_Tp, _Args...>, int> = 0>
inline _LIBCPP_INLINE_VISIBILITY
void emplace(_Args&&... __args) {
__impl.template __emplace<_Ip>(_VSTD::forward<_Args>(__args)...);
}
template <
size_t _Ip,
class _Up,
class... _Args,
enable_if_t<(_Ip < sizeof...(_Types)), int> = 0,
class _Tp = variant_alternative_t<_Ip, variant<_Types...>>,
enable_if_t<is_constructible_v<_Tp, initializer_list<_Up>&, _Args...>,
int> = 0>
inline _LIBCPP_INLINE_VISIBILITY
void emplace(initializer_list<_Up> __il, _Args&&... __args) {
__impl.template __emplace<_Ip>(__il, _VSTD::forward<_Args>(__args)...);
}
template <
class _Tp,
class... _Args,
size_t _Ip =
__find_detail::__find_unambiguous_index_sfinae<_Tp, _Types...>::value,
enable_if_t<is_constructible_v<_Tp, _Args...>, int> = 0>
inline _LIBCPP_INLINE_VISIBILITY
void emplace(_Args&&... __args) {
__impl.template __emplace<_Ip>(_VSTD::forward<_Args>(__args)...);
}
template <
class _Tp,
class _Up,
class... _Args,
size_t _Ip =
__find_detail::__find_unambiguous_index_sfinae<_Tp, _Types...>::value,
enable_if_t<is_constructible_v<_Tp, initializer_list<_Up>&, _Args...>,
int> = 0>
inline _LIBCPP_INLINE_VISIBILITY
void emplace(initializer_list<_Up> __il, _Args&&... __args) {
__impl.template __emplace<_Ip>(__il, _VSTD::forward<_Args>(__args)...);
}
inline _LIBCPP_INLINE_VISIBILITY
constexpr bool valueless_by_exception() const noexcept {
return __impl.valueless_by_exception();
}
inline _LIBCPP_INLINE_VISIBILITY
constexpr size_t index() const noexcept { return __impl.index(); }
template <
bool _Dummy = true,
enable_if_t<
__all<(
__dependent_type<is_move_constructible<_Types>, _Dummy>::value &&
__dependent_type<is_swappable<_Types>, _Dummy>::value)...>::value,
int> = 0>
inline _LIBCPP_INLINE_VISIBILITY
void swap(variant& __that) noexcept(
__all<(is_nothrow_move_constructible_v<_Types> &&
is_nothrow_swappable_v<_Types>)...>::value) {
__impl.__swap(__that.__impl);
}
private:
__variant_detail::__impl<_Types...> __impl;
friend struct __variant_detail::__access::__variant;
friend struct __variant_detail::__visitation::__variant;
};
template <size_t _Ip, class... _Types>
inline _LIBCPP_INLINE_VISIBILITY
constexpr bool __holds_alternative(const variant<_Types...>& __v) noexcept {
return __v.index() == _Ip;
}
template <class _Tp, class... _Types>
inline _LIBCPP_INLINE_VISIBILITY
constexpr bool holds_alternative(const variant<_Types...>& __v) noexcept {
return __holds_alternative<__find_exactly_one_t<_Tp, _Types...>::value>(__v);
}
template <size_t _Ip, class _Vp>
inline _LIBCPP_INLINE_VISIBILITY
static constexpr auto&& __generic_get(_Vp&& __v) {
using __variant_detail::__access::__variant;
if (!__holds_alternative<_Ip>(__v)) {
__throw_bad_variant_access();
}
return __variant::__get_alt<_Ip>(_VSTD::forward<_Vp>(__v)).__value;
}
template <size_t _Ip, class... _Types>
inline _LIBCPP_INLINE_VISIBILITY
constexpr variant_alternative_t<_Ip, variant<_Types...>>& get(
variant<_Types...>& __v) {
static_assert(_Ip < sizeof...(_Types));
static_assert(!is_void_v<variant_alternative_t<_Ip, variant<_Types...>>>);
return __generic_get<_Ip>(__v);
}
template <size_t _Ip, class... _Types>
inline _LIBCPP_INLINE_VISIBILITY
constexpr variant_alternative_t<_Ip, variant<_Types...>>&& get(
variant<_Types...>&& __v) {
static_assert(_Ip < sizeof...(_Types));
static_assert(!is_void_v<variant_alternative_t<_Ip, variant<_Types...>>>);
return __generic_get<_Ip>(_VSTD::move(__v));
}
template <size_t _Ip, class... _Types>
inline _LIBCPP_INLINE_VISIBILITY
constexpr const variant_alternative_t<_Ip, variant<_Types...>>& get(
const variant<_Types...>& __v) {
static_assert(_Ip < sizeof...(_Types));
static_assert(!is_void_v<variant_alternative_t<_Ip, variant<_Types...>>>);
return __generic_get<_Ip>(__v);
}
template <size_t _Ip, class... _Types>
inline _LIBCPP_INLINE_VISIBILITY
constexpr const variant_alternative_t<_Ip, variant<_Types...>>&& get(
const variant<_Types...>&& __v) {
static_assert(_Ip < sizeof...(_Types));
static_assert(!is_void_v<variant_alternative_t<_Ip, variant<_Types...>>>);
return __generic_get<_Ip>(_VSTD::move(__v));
}
template <class _Tp, class... _Types>
inline _LIBCPP_INLINE_VISIBILITY
constexpr _Tp& get(variant<_Types...>& __v) {
static_assert(!is_void_v<_Tp>);
return _VSTD::get<__find_exactly_one_t<_Tp, _Types...>::value>(__v);
}
template <class _Tp, class... _Types>
inline _LIBCPP_INLINE_VISIBILITY
constexpr _Tp&& get(variant<_Types...>&& __v) {
static_assert(!is_void_v<_Tp>);
return _VSTD::get<__find_exactly_one_t<_Tp, _Types...>::value>(
_VSTD::move(__v));
}
template <class _Tp, class... _Types>
inline _LIBCPP_INLINE_VISIBILITY
constexpr const _Tp& get(const variant<_Types...>& __v) {
static_assert(!is_void_v<_Tp>);
return _VSTD::get<__find_exactly_one_t<_Tp, _Types...>::value>(__v);
}
template <class _Tp, class... _Types>
inline _LIBCPP_INLINE_VISIBILITY
constexpr const _Tp&& get(const variant<_Types...>&& __v) {
static_assert(!is_void_v<_Tp>);
return _VSTD::get<__find_exactly_one_t<_Tp, _Types...>::value>(
_VSTD::move(__v));
}
template <size_t _Ip, class _Vp>
inline _LIBCPP_INLINE_VISIBILITY
constexpr auto* __generic_get_if(_Vp* __v) noexcept {
using __variant_detail::__access::__variant;
return __v && __holds_alternative<_Ip>(*__v)
? _VSTD::addressof(__variant::__get_alt<_Ip>(*__v).__value)
: nullptr;
}
template <size_t _Ip, class... _Types>
inline _LIBCPP_INLINE_VISIBILITY
constexpr add_pointer_t<variant_alternative_t<_Ip, variant<_Types...>>>
get_if(variant<_Types...>* __v) noexcept {
static_assert(_Ip < sizeof...(_Types));
static_assert(!is_void_v<variant_alternative_t<_Ip, variant<_Types...>>>);
return __generic_get_if<_Ip>(__v);
}
template <size_t _Ip, class... _Types>
inline _LIBCPP_INLINE_VISIBILITY
constexpr add_pointer_t<const variant_alternative_t<_Ip, variant<_Types...>>>
get_if(const variant<_Types...>* __v) noexcept {
static_assert(_Ip < sizeof...(_Types));
static_assert(!is_void_v<variant_alternative_t<_Ip, variant<_Types...>>>);
return __generic_get_if<_Ip>(__v);
}
template <class _Tp, class... _Types>
inline _LIBCPP_INLINE_VISIBILITY
constexpr add_pointer_t<_Tp>
get_if(variant<_Types...>* __v) noexcept {
static_assert(!is_void_v<_Tp>);
return _VSTD::get_if<__find_exactly_one_t<_Tp, _Types...>::value>(__v);
}
template <class _Tp, class... _Types>
inline _LIBCPP_INLINE_VISIBILITY
constexpr add_pointer_t<const _Tp>
get_if(const variant<_Types...>* __v) noexcept {
static_assert(!is_void_v<_Tp>);
return _VSTD::get_if<__find_exactly_one_t<_Tp, _Types...>::value>(__v);
}
template <class... _Types>
inline _LIBCPP_INLINE_VISIBILITY
constexpr bool operator==(const variant<_Types...>& __lhs,
const variant<_Types...>& __rhs) {
using __variant_detail::__visitation::__variant;
if (__lhs.index() != __rhs.index()) return false;
if (__lhs.valueless_by_exception()) return true;
return __variant::__visit_value_at(__lhs.index(), equal_to<>{}, __lhs, __rhs);
}
template <class... _Types>
inline _LIBCPP_INLINE_VISIBILITY
constexpr bool operator!=(const variant<_Types...>& __lhs,
const variant<_Types...>& __rhs) {
using __variant_detail::__visitation::__variant;
if (__lhs.index() != __rhs.index()) return true;
if (__lhs.valueless_by_exception()) return false;
return __variant::__visit_value_at(
__lhs.index(), not_equal_to<>{}, __lhs, __rhs);
}
template <class... _Types>
inline _LIBCPP_INLINE_VISIBILITY
constexpr bool operator<(const variant<_Types...>& __lhs,
const variant<_Types...>& __rhs) {
using __variant_detail::__visitation::__variant;
if (__rhs.valueless_by_exception()) return false;
if (__lhs.valueless_by_exception()) return true;
if (__lhs.index() < __rhs.index()) return true;
if (__lhs.index() > __rhs.index()) return false;
return __variant::__visit_value_at(__lhs.index(), less<>{}, __lhs, __rhs);
}
template <class... _Types>
inline _LIBCPP_INLINE_VISIBILITY
constexpr bool operator>(const variant<_Types...>& __lhs,
const variant<_Types...>& __rhs) {
using __variant_detail::__visitation::__variant;
if (__lhs.valueless_by_exception()) return false;
if (__rhs.valueless_by_exception()) return true;
if (__lhs.index() > __rhs.index()) return true;
if (__lhs.index() < __rhs.index()) return false;
return __variant::__visit_value_at(__lhs.index(), greater<>{}, __lhs, __rhs);
}
template <class... _Types>
inline _LIBCPP_INLINE_VISIBILITY
constexpr bool operator<=(const variant<_Types...>& __lhs,
const variant<_Types...>& __rhs) {
using __variant_detail::__visitation::__variant;
if (__lhs.valueless_by_exception()) return true;
if (__rhs.valueless_by_exception()) return false;
if (__lhs.index() < __rhs.index()) return true;
if (__lhs.index() > __rhs.index()) return false;
return __variant::__visit_value_at(
__lhs.index(), less_equal<>{}, __lhs, __rhs);
}
template <class... _Types>
inline _LIBCPP_INLINE_VISIBILITY
constexpr bool operator>=(const variant<_Types...>& __lhs,
const variant<_Types...>& __rhs) {
using __variant_detail::__visitation::__variant;
if (__rhs.valueless_by_exception()) return true;
if (__lhs.valueless_by_exception()) return false;
if (__lhs.index() > __rhs.index()) return true;
if (__lhs.index() < __rhs.index()) return false;
return __variant::__visit_value_at(
__lhs.index(), greater_equal<>{}, __lhs, __rhs);
}
template <class _Visitor, class... _Vs>
inline _LIBCPP_INLINE_VISIBILITY
constexpr decltype(auto) visit(_Visitor&& __visitor, _Vs&&... __vs) {
using __variant_detail::__visitation::__variant;
bool __results[] = {__vs.valueless_by_exception()...};
for (bool __result : __results) {
if (__result) {
__throw_bad_variant_access();
}
}
return __variant::__visit_value(_VSTD::forward<_Visitor>(__visitor),
_VSTD::forward<_Vs>(__vs)...);
}
struct _LIBCPP_TEMPLATE_VIS monostate {};
inline _LIBCPP_INLINE_VISIBILITY
constexpr bool operator<(monostate, monostate) noexcept { return false; }
inline _LIBCPP_INLINE_VISIBILITY
constexpr bool operator>(monostate, monostate) noexcept { return false; }
inline _LIBCPP_INLINE_VISIBILITY
constexpr bool operator<=(monostate, monostate) noexcept { return true; }
inline _LIBCPP_INLINE_VISIBILITY
constexpr bool operator>=(monostate, monostate) noexcept { return true; }
inline _LIBCPP_INLINE_VISIBILITY
constexpr bool operator==(monostate, monostate) noexcept { return true; }
inline _LIBCPP_INLINE_VISIBILITY
constexpr bool operator!=(monostate, monostate) noexcept { return false; }
template <class... _Types>
inline _LIBCPP_INLINE_VISIBILITY
auto swap(variant<_Types...>& __lhs,
variant<_Types...>& __rhs) noexcept(noexcept(__lhs.swap(__rhs)))
-> decltype(__lhs.swap(__rhs)) {
__lhs.swap(__rhs);
}
template <class... _Types>
struct _LIBCPP_TEMPLATE_VIS hash<variant<_Types...>> {
using argument_type = variant<_Types...>;
using result_type = size_t;
inline _LIBCPP_INLINE_VISIBILITY
result_type operator()(const argument_type& __v) const {
using __variant_detail::__visitation::__variant;
size_t __res =
__v.valueless_by_exception()
? 299792458 // Random value chosen by the universe upon creation
: __variant::__visit_alt(
[](const auto& __alt) {
using __alt_type = decay_t<decltype(__alt)>;
using __value_type = typename __alt_type::__value_type;
return hash<__value_type>{}(__alt.__value);
},
__v);
return __hash_combine(__res, hash<size_t>{}(__v.index()));
}
};
template <>
struct _LIBCPP_TEMPLATE_VIS hash<monostate> {
using argument_type = monostate;
using result_type = size_t;
inline _LIBCPP_INLINE_VISIBILITY
result_type operator()(const argument_type&) const {
return 66740831; // return a fundamentally attractive random value.
}
};
#endif // _LIBCPP_STD_VER > 14
_LIBCPP_END_NAMESPACE_STD
#endif // _LIBCPP_VARIANT
Index: projects/clang400-import/contrib/libc++/src/optional.cpp
===================================================================
--- projects/clang400-import/contrib/libc++/src/optional.cpp (revision 313642)
+++ projects/clang400-import/contrib/libc++/src/optional.cpp (revision 313643)
@@ -1,24 +1,28 @@
//===------------------------ optional.cpp --------------------------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is dual licensed under the MIT and the University of Illinois Open
// Source Licenses. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
#include "optional"
#include "experimental/optional"
namespace std
{
bad_optional_access::~bad_optional_access() _NOEXCEPT = default;
+const char* bad_optional_access::what() const _NOEXCEPT {
+ return "bad_optional_access";
+ }
+
} // std
_LIBCPP_BEGIN_NAMESPACE_EXPERIMENTAL
bad_optional_access::~bad_optional_access() _NOEXCEPT = default;
_LIBCPP_END_NAMESPACE_EXPERIMENTAL
Index: projects/clang400-import/contrib/libc++
===================================================================
--- projects/clang400-import/contrib/libc++ (revision 313642)
+++ projects/clang400-import/contrib/libc++ (revision 313643)
Property changes on: projects/clang400-import/contrib/libc++
___________________________________________________________________
Modified: svn:mergeinfo
## -0,0 +0,1 ##
Merged /vendor/libc++/dist:r313300-313642
Index: projects/clang400-import/contrib/llvm/include/llvm/ADT/ilist_iterator.h
===================================================================
--- projects/clang400-import/contrib/llvm/include/llvm/ADT/ilist_iterator.h (revision 313642)
+++ projects/clang400-import/contrib/llvm/include/llvm/ADT/ilist_iterator.h (revision 313643)
@@ -1,185 +1,198 @@
//===- llvm/ADT/ilist_iterator.h - Intrusive List Iterator -------*- C++ -*-==//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
#ifndef LLVM_ADT_ILIST_ITERATOR_H
#define LLVM_ADT_ILIST_ITERATOR_H
#include "llvm/ADT/ilist_node.h"
#include <cassert>
#include <cstddef>
#include <iterator>
#include <type_traits>
namespace llvm {
namespace ilist_detail {
/// Find const-correct node types.
template <class OptionsT, bool IsConst> struct IteratorTraits;
template <class OptionsT> struct IteratorTraits<OptionsT, false> {
typedef typename OptionsT::value_type value_type;
typedef typename OptionsT::pointer pointer;
typedef typename OptionsT::reference reference;
typedef ilist_node_impl<OptionsT> *node_pointer;
typedef ilist_node_impl<OptionsT> &node_reference;
};
template <class OptionsT> struct IteratorTraits<OptionsT, true> {
typedef const typename OptionsT::value_type value_type;
typedef typename OptionsT::const_pointer pointer;
typedef typename OptionsT::const_reference reference;
typedef const ilist_node_impl<OptionsT> *node_pointer;
typedef const ilist_node_impl<OptionsT> &node_reference;
};
template <bool IsReverse> struct IteratorHelper;
template <> struct IteratorHelper<false> : ilist_detail::NodeAccess {
typedef ilist_detail::NodeAccess Access;
template <class T> static void increment(T *&I) { I = Access::getNext(*I); }
template <class T> static void decrement(T *&I) { I = Access::getPrev(*I); }
};
template <> struct IteratorHelper<true> : ilist_detail::NodeAccess {
typedef ilist_detail::NodeAccess Access;
template <class T> static void increment(T *&I) { I = Access::getPrev(*I); }
template <class T> static void decrement(T *&I) { I = Access::getNext(*I); }
};
} // end namespace ilist_detail
/// Iterator for intrusive lists based on ilist_node.
template <class OptionsT, bool IsReverse, bool IsConst>
class ilist_iterator : ilist_detail::SpecificNodeAccess<OptionsT> {
friend ilist_iterator<OptionsT, IsReverse, !IsConst>;
friend ilist_iterator<OptionsT, !IsReverse, IsConst>;
friend ilist_iterator<OptionsT, !IsReverse, !IsConst>;
typedef ilist_detail::IteratorTraits<OptionsT, IsConst> Traits;
typedef ilist_detail::SpecificNodeAccess<OptionsT> Access;
public:
typedef typename Traits::value_type value_type;
typedef typename Traits::pointer pointer;
typedef typename Traits::reference reference;
typedef ptrdiff_t difference_type;
typedef std::bidirectional_iterator_tag iterator_category;
typedef typename OptionsT::const_pointer const_pointer;
typedef typename OptionsT::const_reference const_reference;
private:
typedef typename Traits::node_pointer node_pointer;
typedef typename Traits::node_reference node_reference;
node_pointer NodePtr;
public:
/// Create from an ilist_node.
explicit ilist_iterator(node_reference N) : NodePtr(&N) {}
explicit ilist_iterator(pointer NP) : NodePtr(Access::getNodePtr(NP)) {}
explicit ilist_iterator(reference NR) : NodePtr(Access::getNodePtr(&NR)) {}
ilist_iterator() : NodePtr(nullptr) {}
// This is templated so that we can allow constructing a const iterator from
// a nonconst iterator...
template <bool RHSIsConst>
ilist_iterator(
const ilist_iterator<OptionsT, IsReverse, RHSIsConst> &RHS,
typename std::enable_if<IsConst || !RHSIsConst, void *>::type = nullptr)
: NodePtr(RHS.NodePtr) {}
// This is templated so that we can allow assigning to a const iterator from
// a nonconst iterator...
template <bool RHSIsConst>
typename std::enable_if<IsConst || !RHSIsConst, ilist_iterator &>::type
operator=(const ilist_iterator<OptionsT, IsReverse, RHSIsConst> &RHS) {
NodePtr = RHS.NodePtr;
return *this;
}
- /// Convert from an iterator to its reverse.
+ /// Explicit conversion between forward/reverse iterators.
///
- /// TODO: Roll this into the implicit constructor once we're sure that no one
- /// is relying on the std::reverse_iterator off-by-one semantics.
+ /// Translate between forward and reverse iterators without changing range
+ /// boundaries. The resulting iterator will dereference (and have a handle)
+ /// to the previous node, which is somewhat unexpected; but converting the
+ /// two endpoints in a range will give the same range in reverse.
+ ///
+ /// This matches std::reverse_iterator conversions.
+ explicit ilist_iterator(
+ const ilist_iterator<OptionsT, !IsReverse, IsConst> &RHS)
+ : ilist_iterator(++RHS.getReverse()) {}
+
+ /// Get a reverse iterator to the same node.
+ ///
+ /// Gives a reverse iterator that will dereference (and have a handle) to the
+ /// same node. Converting the endpoint iterators in a range will give a
+ /// different range; for range operations, use the explicit conversions.
ilist_iterator<OptionsT, !IsReverse, IsConst> getReverse() const {
if (NodePtr)
return ilist_iterator<OptionsT, !IsReverse, IsConst>(*NodePtr);
return ilist_iterator<OptionsT, !IsReverse, IsConst>();
}
/// Const-cast.
ilist_iterator<OptionsT, IsReverse, false> getNonConst() const {
if (NodePtr)
return ilist_iterator<OptionsT, IsReverse, false>(
const_cast<typename ilist_iterator<OptionsT, IsReverse,
false>::node_reference>(*NodePtr));
return ilist_iterator<OptionsT, IsReverse, false>();
}
// Accessors...
reference operator*() const {
assert(!NodePtr->isKnownSentinel());
return *Access::getValuePtr(NodePtr);
}
pointer operator->() const { return &operator*(); }
// Comparison operators
friend bool operator==(const ilist_iterator &LHS, const ilist_iterator &RHS) {
return LHS.NodePtr == RHS.NodePtr;
}
friend bool operator!=(const ilist_iterator &LHS, const ilist_iterator &RHS) {
return LHS.NodePtr != RHS.NodePtr;
}
// Increment and decrement operators...
ilist_iterator &operator--() {
NodePtr = IsReverse ? NodePtr->getNext() : NodePtr->getPrev();
return *this;
}
ilist_iterator &operator++() {
NodePtr = IsReverse ? NodePtr->getPrev() : NodePtr->getNext();
return *this;
}
ilist_iterator operator--(int) {
ilist_iterator tmp = *this;
--*this;
return tmp;
}
ilist_iterator operator++(int) {
ilist_iterator tmp = *this;
++*this;
return tmp;
}
/// Get the underlying ilist_node.
node_pointer getNodePtr() const { return static_cast<node_pointer>(NodePtr); }
/// Check for end. Only valid if ilist_sentinel_tracking<true>.
bool isEnd() const { return NodePtr ? NodePtr->isSentinel() : false; }
};
template <typename From> struct simplify_type;
/// Allow ilist_iterators to convert into pointers to a node automatically when
/// used by the dyn_cast, cast, isa mechanisms...
///
/// FIXME: remove this, since there is no implicit conversion to NodeTy.
template <class OptionsT, bool IsConst>
struct simplify_type<ilist_iterator<OptionsT, false, IsConst>> {
typedef ilist_iterator<OptionsT, false, IsConst> iterator;
typedef typename iterator::pointer SimpleType;
static SimpleType getSimplifiedValue(const iterator &Node) { return &*Node; }
};
template <class OptionsT, bool IsConst>
struct simplify_type<const ilist_iterator<OptionsT, false, IsConst>>
: simplify_type<ilist_iterator<OptionsT, false, IsConst>> {};
} // end namespace llvm
#endif // LLVM_ADT_ILIST_ITERATOR_H
Index: projects/clang400-import/contrib/llvm/include/llvm/CodeGen/MachineInstrBundleIterator.h
===================================================================
--- projects/clang400-import/contrib/llvm/include/llvm/CodeGen/MachineInstrBundleIterator.h (revision 313642)
+++ projects/clang400-import/contrib/llvm/include/llvm/CodeGen/MachineInstrBundleIterator.h (revision 313643)
@@ -1,266 +1,283 @@
//===- llvm/CodeGen/MachineInstrBundleIterator.h ----------------*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// Defines an iterator class that bundles MachineInstr.
//
//===----------------------------------------------------------------------===//
#ifndef LLVM_CODEGEN_MACHINEINSTRBUNDLEITERATOR_H
#define LLVM_CODEGEN_MACHINEINSTRBUNDLEITERATOR_H
#include "llvm/ADT/ilist.h"
#include <iterator>
namespace llvm {
template <class T, bool IsReverse> struct MachineInstrBundleIteratorTraits;
template <class T> struct MachineInstrBundleIteratorTraits<T, false> {
typedef simple_ilist<T, ilist_sentinel_tracking<true>> list_type;
typedef typename list_type::iterator instr_iterator;
typedef typename list_type::iterator nonconst_instr_iterator;
typedef typename list_type::const_iterator const_instr_iterator;
};
template <class T> struct MachineInstrBundleIteratorTraits<T, true> {
typedef simple_ilist<T, ilist_sentinel_tracking<true>> list_type;
typedef typename list_type::reverse_iterator instr_iterator;
typedef typename list_type::reverse_iterator nonconst_instr_iterator;
typedef typename list_type::const_reverse_iterator const_instr_iterator;
};
template <class T> struct MachineInstrBundleIteratorTraits<const T, false> {
typedef simple_ilist<T, ilist_sentinel_tracking<true>> list_type;
typedef typename list_type::const_iterator instr_iterator;
typedef typename list_type::iterator nonconst_instr_iterator;
typedef typename list_type::const_iterator const_instr_iterator;
};
template <class T> struct MachineInstrBundleIteratorTraits<const T, true> {
typedef simple_ilist<T, ilist_sentinel_tracking<true>> list_type;
typedef typename list_type::const_reverse_iterator instr_iterator;
typedef typename list_type::reverse_iterator nonconst_instr_iterator;
typedef typename list_type::const_reverse_iterator const_instr_iterator;
};
template <bool IsReverse> struct MachineInstrBundleIteratorHelper;
template <> struct MachineInstrBundleIteratorHelper<false> {
/// Get the beginning of the current bundle.
template <class Iterator> static Iterator getBundleBegin(Iterator I) {
if (!I.isEnd())
while (I->isBundledWithPred())
--I;
return I;
}
/// Get the final node of the current bundle.
template <class Iterator> static Iterator getBundleFinal(Iterator I) {
if (!I.isEnd())
while (I->isBundledWithSucc())
++I;
return I;
}
/// Increment forward ilist iterator.
template <class Iterator> static void increment(Iterator &I) {
I = std::next(getBundleFinal(I));
}
/// Decrement forward ilist iterator.
template <class Iterator> static void decrement(Iterator &I) {
I = getBundleBegin(std::prev(I));
}
};
template <> struct MachineInstrBundleIteratorHelper<true> {
/// Get the beginning of the current bundle.
template <class Iterator> static Iterator getBundleBegin(Iterator I) {
return MachineInstrBundleIteratorHelper<false>::getBundleBegin(
I.getReverse())
.getReverse();
}
/// Get the final node of the current bundle.
template <class Iterator> static Iterator getBundleFinal(Iterator I) {
return MachineInstrBundleIteratorHelper<false>::getBundleFinal(
I.getReverse())
.getReverse();
}
/// Increment reverse ilist iterator.
template <class Iterator> static void increment(Iterator &I) {
I = getBundleBegin(std::next(I));
}
/// Decrement reverse ilist iterator.
template <class Iterator> static void decrement(Iterator &I) {
I = std::prev(getBundleFinal(I));
}
};
/// MachineBasicBlock iterator that automatically skips over MIs that are
/// inside bundles (i.e. walk top level MIs only).
template <typename Ty, bool IsReverse = false>
class MachineInstrBundleIterator : MachineInstrBundleIteratorHelper<IsReverse> {
typedef MachineInstrBundleIteratorTraits<Ty, IsReverse> Traits;
typedef typename Traits::instr_iterator instr_iterator;
instr_iterator MII;
public:
typedef typename instr_iterator::value_type value_type;
typedef typename instr_iterator::difference_type difference_type;
typedef typename instr_iterator::pointer pointer;
typedef typename instr_iterator::reference reference;
typedef std::bidirectional_iterator_tag iterator_category;
typedef typename instr_iterator::const_pointer const_pointer;
typedef typename instr_iterator::const_reference const_reference;
private:
typedef typename Traits::nonconst_instr_iterator nonconst_instr_iterator;
typedef typename Traits::const_instr_iterator const_instr_iterator;
typedef MachineInstrBundleIterator<
typename nonconst_instr_iterator::value_type, IsReverse>
nonconst_iterator;
typedef MachineInstrBundleIterator<Ty, !IsReverse> reverse_iterator;
public:
MachineInstrBundleIterator(instr_iterator MI) : MII(MI) {
assert((!MI.getNodePtr() || MI.isEnd() || !MI->isBundledWithPred()) &&
"It's not legal to initialize MachineInstrBundleIterator with a "
"bundled MI");
}
MachineInstrBundleIterator(reference MI) : MII(MI) {
assert(!MI.isBundledWithPred() && "It's not legal to initialize "
"MachineInstrBundleIterator with a "
"bundled MI");
}
MachineInstrBundleIterator(pointer MI) : MII(MI) {
// FIXME: This conversion should be explicit.
assert((!MI || !MI->isBundledWithPred()) && "It's not legal to initialize "
"MachineInstrBundleIterator "
"with a bundled MI");
}
// Template allows conversion from const to nonconst.
template <class OtherTy>
MachineInstrBundleIterator(
const MachineInstrBundleIterator<OtherTy, IsReverse> &I,
typename std::enable_if<std::is_convertible<OtherTy *, Ty *>::value,
void *>::type = nullptr)
: MII(I.getInstrIterator()) {}
MachineInstrBundleIterator() : MII(nullptr) {}
+ /// Explicit conversion between forward/reverse iterators.
+ ///
+ /// Translate between forward and reverse iterators without changing range
+ /// boundaries. The resulting iterator will dereference (and have a handle)
+ /// to the previous node, which is somewhat unexpected; but converting the
+ /// two endpoints in a range will give the same range in reverse.
+ ///
+ /// This matches std::reverse_iterator conversions.
+ explicit MachineInstrBundleIterator(
+ const MachineInstrBundleIterator<Ty, !IsReverse> &I)
+ : MachineInstrBundleIterator(++I.getReverse()) {}
+
/// Get the bundle iterator for the given instruction's bundle.
static MachineInstrBundleIterator getAtBundleBegin(instr_iterator MI) {
return MachineInstrBundleIteratorHelper<IsReverse>::getBundleBegin(MI);
}
reference operator*() const { return *MII; }
pointer operator->() const { return &operator*(); }
/// Check for null.
bool isValid() const { return MII.getNodePtr(); }
friend bool operator==(const MachineInstrBundleIterator &L,
const MachineInstrBundleIterator &R) {
return L.MII == R.MII;
}
friend bool operator==(const MachineInstrBundleIterator &L,
const const_instr_iterator &R) {
return L.MII == R; // Avoid assertion about validity of R.
}
friend bool operator==(const const_instr_iterator &L,
const MachineInstrBundleIterator &R) {
return L == R.MII; // Avoid assertion about validity of L.
}
friend bool operator==(const MachineInstrBundleIterator &L,
const nonconst_instr_iterator &R) {
return L.MII == R; // Avoid assertion about validity of R.
}
friend bool operator==(const nonconst_instr_iterator &L,
const MachineInstrBundleIterator &R) {
return L == R.MII; // Avoid assertion about validity of L.
}
friend bool operator==(const MachineInstrBundleIterator &L, const_pointer R) {
return L == const_instr_iterator(R); // Avoid assertion about validity of R.
}
friend bool operator==(const_pointer L, const MachineInstrBundleIterator &R) {
return const_instr_iterator(L) == R; // Avoid assertion about validity of L.
}
friend bool operator==(const MachineInstrBundleIterator &L,
const_reference R) {
return L == &R; // Avoid assertion about validity of R.
}
friend bool operator==(const_reference L,
const MachineInstrBundleIterator &R) {
return &L == R; // Avoid assertion about validity of L.
}
friend bool operator!=(const MachineInstrBundleIterator &L,
const MachineInstrBundleIterator &R) {
return !(L == R);
}
friend bool operator!=(const MachineInstrBundleIterator &L,
const const_instr_iterator &R) {
return !(L == R);
}
friend bool operator!=(const const_instr_iterator &L,
const MachineInstrBundleIterator &R) {
return !(L == R);
}
friend bool operator!=(const MachineInstrBundleIterator &L,
const nonconst_instr_iterator &R) {
return !(L == R);
}
friend bool operator!=(const nonconst_instr_iterator &L,
const MachineInstrBundleIterator &R) {
return !(L == R);
}
friend bool operator!=(const MachineInstrBundleIterator &L, const_pointer R) {
return !(L == R);
}
friend bool operator!=(const_pointer L, const MachineInstrBundleIterator &R) {
return !(L == R);
}
friend bool operator!=(const MachineInstrBundleIterator &L,
const_reference R) {
return !(L == R);
}
friend bool operator!=(const_reference L,
const MachineInstrBundleIterator &R) {
return !(L == R);
}
// Increment and decrement operators...
MachineInstrBundleIterator &operator--() {
this->decrement(MII);
return *this;
}
MachineInstrBundleIterator &operator++() {
this->increment(MII);
return *this;
}
MachineInstrBundleIterator operator--(int) {
MachineInstrBundleIterator Temp = *this;
--*this;
return Temp;
}
MachineInstrBundleIterator operator++(int) {
MachineInstrBundleIterator Temp = *this;
++*this;
return Temp;
}
instr_iterator getInstrIterator() const { return MII; }
nonconst_iterator getNonConstIterator() const { return MII.getNonConst(); }
+ /// Get a reverse iterator to the same node.
+ ///
+ /// Gives a reverse iterator that will dereference (and have a handle) to the
+ /// same node. Converting the endpoint iterators in a range will give a
+ /// different range; for range operations, use the explicit conversions.
reverse_iterator getReverse() const { return MII.getReverse(); }
};
} // end namespace llvm
#endif
Index: projects/clang400-import/contrib/llvm/include/llvm/IR/PassManager.h
===================================================================
--- projects/clang400-import/contrib/llvm/include/llvm/IR/PassManager.h (revision 313642)
+++ projects/clang400-import/contrib/llvm/include/llvm/IR/PassManager.h (revision 313643)
@@ -1,1255 +1,1261 @@
//===- PassManager.h - Pass management infrastructure -----------*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
/// \file
///
/// This header defines various interfaces for pass management in LLVM. There
/// is no "pass" interface in LLVM per se. Instead, an instance of any class
/// which supports a method to 'run' it over a unit of IR can be used as
/// a pass. A pass manager is generally a tool to collect a sequence of passes
/// which run over a particular IR construct, and run each of them in sequence
/// over each such construct in the containing IR construct. As there is no
/// containing IR construct for a Module, a manager for passes over modules
/// forms the base case which runs its managed passes in sequence over the
/// single module provided.
///
/// The core IR library provides managers for running passes over
/// modules and functions.
///
/// * FunctionPassManager can run over a Module, runs each pass over
/// a Function.
/// * ModulePassManager must be directly run, runs each pass over the Module.
///
/// Note that the implementations of the pass managers use concept-based
/// polymorphism as outlined in the "Value Semantics and Concept-based
/// Polymorphism" talk (or its abbreviated sibling "Inheritance Is The Base
/// Class of Evil") by Sean Parent:
/// * http://github.com/sean-parent/sean-parent.github.com/wiki/Papers-and-Presentations
/// * http://www.youtube.com/watch?v=_BpMYeUFXv8
/// * http://channel9.msdn.com/Events/GoingNative/2013/Inheritance-Is-The-Base-Class-of-Evil
///
//===----------------------------------------------------------------------===//
#ifndef LLVM_IR_PASSMANAGER_H
#define LLVM_IR_PASSMANAGER_H
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/TinyPtrVector.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/PassManagerInternal.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/TypeName.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Support/type_traits.h"
#include <list>
#include <memory>
#include <vector>
namespace llvm {
/// A special type used by analysis passes to provide an address that
/// identifies that particular analysis pass type.
///
/// Analysis passes should have a static data member of this type and derive
/// from the \c AnalysisInfoMixin to get a static ID method used to identify
/// the analysis in the pass management infrastructure.
struct alignas(8) AnalysisKey {};
/// A special type used to provide an address that identifies a set of related
/// analyses. These sets are primarily used below to mark sets of analyses as
/// preserved.
///
/// For example, a transformation can indicate that it preserves the CFG of a
/// function by preserving the appropriate AnalysisSetKey. An analysis that
/// depends only on the CFG can then check if that AnalysisSetKey is preserved;
/// if it is, the analysis knows that it itself is preserved.
struct alignas(8) AnalysisSetKey {};
/// A set of analyses that are preserved following a run of a transformation
/// pass.
///
/// Transformation passes build and return these objects to communicate which
/// analyses are still valid after the transformation. For most passes this is
/// fairly simple: if they don't change anything all analyses are preserved,
/// otherwise only a short list of analyses that have been explicitly updated
/// are preserved.
///
/// This class also lets transformation passes mark abstract *sets* of analyses
/// as preserved. A transformation that (say) does not alter the CFG can
/// indicate such by marking a particular AnalysisSetKey as preserved, and
/// then analyses can query whether that AnalysisSetKey is preserved.
///
/// Finally, this class can represent an "abandoned" analysis, which is
/// not preserved even if it would be covered by some abstract set of analyses.
///
/// Given a `PreservedAnalyses` object, an analysis will typically want to
/// figure out whether it is preserved. In the example below, MyAnalysisType is
/// preserved if it's not abandoned, and (a) it's explicitly marked as
/// preserved, (b), the set AllAnalysesOn<MyIRUnit> is preserved, or (c) both
/// AnalysisSetA and AnalysisSetB are preserved.
///
/// ```
/// auto PAC = PA.getChecker<MyAnalysisType>();
/// if (PAC.preserved() || PAC.preservedSet<AllAnalysesOn<MyIRUnit>>() ||
/// (PAC.preservedSet<AnalysisSetA>() &&
/// PAC.preservedSet<AnalysisSetB>())) {
/// // The analysis has been successfully preserved ...
/// }
/// ```
class PreservedAnalyses {
public:
/// \brief Convenience factory function for the empty preserved set.
static PreservedAnalyses none() { return PreservedAnalyses(); }
/// \brief Construct a special preserved set that preserves all passes.
static PreservedAnalyses all() {
PreservedAnalyses PA;
PA.PreservedIDs.insert(&AllAnalysesKey);
return PA;
}
/// Mark an analysis as preserved.
template <typename AnalysisT> void preserve() { preserve(AnalysisT::ID()); }
/// \brief Given an analysis's ID, mark the analysis as preserved, adding it
/// to the set.
void preserve(AnalysisKey *ID) {
// Clear this ID from the explicit not-preserved set if present.
NotPreservedAnalysisIDs.erase(ID);
// If we're not already preserving all analyses (other than those in
// NotPreservedAnalysisIDs).
if (!areAllPreserved())
PreservedIDs.insert(ID);
}
/// Mark an analysis set as preserved.
template <typename AnalysisSetT> void preserveSet() {
preserveSet(AnalysisSetT::ID());
}
/// Mark an analysis set as preserved using its ID.
void preserveSet(AnalysisSetKey *ID) {
// If we're not already in the saturated 'all' state, add this set.
if (!areAllPreserved())
PreservedIDs.insert(ID);
}
/// Mark an analysis as abandoned.
///
/// An abandoned analysis is not preserved, even if it is nominally covered
/// by some other set or was previously explicitly marked as preserved.
///
/// Note that you can only abandon a specific analysis, not a *set* of
/// analyses.
template <typename AnalysisT> void abandon() { abandon(AnalysisT::ID()); }
/// Mark an analysis as abandoned using its ID.
///
/// An abandoned analysis is not preserved, even if it is nominally covered
/// by some other set or was previously explicitly marked as preserved.
///
/// Note that you can only abandon a specific analysis, not a *set* of
/// analyses.
void abandon(AnalysisKey *ID) {
PreservedIDs.erase(ID);
NotPreservedAnalysisIDs.insert(ID);
}
/// \brief Intersect this set with another in place.
///
/// This is a mutating operation on this preserved set, removing all
/// preserved passes which are not also preserved in the argument.
void intersect(const PreservedAnalyses &Arg) {
if (Arg.areAllPreserved())
return;
if (areAllPreserved()) {
*this = Arg;
return;
}
// The intersection requires the *union* of the explicitly not-preserved
// IDs and the *intersection* of the preserved IDs.
for (auto ID : Arg.NotPreservedAnalysisIDs) {
PreservedIDs.erase(ID);
NotPreservedAnalysisIDs.insert(ID);
}
for (auto ID : PreservedIDs)
if (!Arg.PreservedIDs.count(ID))
PreservedIDs.erase(ID);
}
/// \brief Intersect this set with a temporary other set in place.
///
/// This is a mutating operation on this preserved set, removing all
/// preserved passes which are not also preserved in the argument.
void intersect(PreservedAnalyses &&Arg) {
if (Arg.areAllPreserved())
return;
if (areAllPreserved()) {
*this = std::move(Arg);
return;
}
// The intersection requires the *union* of the explicitly not-preserved
// IDs and the *intersection* of the preserved IDs.
for (auto ID : Arg.NotPreservedAnalysisIDs) {
PreservedIDs.erase(ID);
NotPreservedAnalysisIDs.insert(ID);
}
for (auto ID : PreservedIDs)
if (!Arg.PreservedIDs.count(ID))
PreservedIDs.erase(ID);
}
/// A checker object that makes it easy to query for whether an analysis or
/// some set covering it is preserved.
class PreservedAnalysisChecker {
friend class PreservedAnalyses;
const PreservedAnalyses &PA;
AnalysisKey *const ID;
const bool IsAbandoned;
/// A PreservedAnalysisChecker is tied to a particular Analysis because
/// `preserved()` and `preservedSet()` both return false if the Analysis
/// was abandoned.
PreservedAnalysisChecker(const PreservedAnalyses &PA, AnalysisKey *ID)
: PA(PA), ID(ID), IsAbandoned(PA.NotPreservedAnalysisIDs.count(ID)) {}
public:
/// Returns true if the checker's analysis was not abandoned and either
/// - the analysis is explicitly preserved or
/// - all analyses are preserved.
bool preserved() {
return !IsAbandoned && (PA.PreservedIDs.count(&AllAnalysesKey) ||
PA.PreservedIDs.count(ID));
}
/// Returns true if the checker's analysis was not abandoned and either
/// - \p AnalysisSetT is explicitly preserved or
/// - all analyses are preserved.
template <typename AnalysisSetT> bool preservedSet() {
AnalysisSetKey *SetID = AnalysisSetT::ID();
return !IsAbandoned && (PA.PreservedIDs.count(&AllAnalysesKey) ||
PA.PreservedIDs.count(SetID));
}
};
/// Build a checker for this `PreservedAnalyses` and the specified analysis
/// type.
///
/// You can use the returned object to query whether an analysis was
/// preserved. See the example in the comment on `PreservedAnalysis`.
template <typename AnalysisT> PreservedAnalysisChecker getChecker() const {
return PreservedAnalysisChecker(*this, AnalysisT::ID());
}
/// Build a checker for this `PreservedAnalyses` and the specified analysis
/// ID.
///
/// You can use the returned object to query whether an analysis was
/// preserved. See the example in the comment on `PreservedAnalysis`.
PreservedAnalysisChecker getChecker(AnalysisKey *ID) const {
return PreservedAnalysisChecker(*this, ID);
}
/// Test whether all analyses are preserved (and none are abandoned).
///
/// This is used primarily to optimize for the common case of a transformation
/// which makes no changes to the IR.
bool areAllPreserved() const {
return NotPreservedAnalysisIDs.empty() &&
PreservedIDs.count(&AllAnalysesKey);
}
/// Directly test whether a set of analyses is preserved.
///
/// This is only true when no analyses have been explicitly abandoned.
template <typename AnalysisSetT> bool allAnalysesInSetPreserved() const {
return allAnalysesInSetPreserved(AnalysisSetT::ID());
}
/// Directly test whether a set of analyses is preserved.
///
/// This is only true when no analyses have been explicitly abandoned.
bool allAnalysesInSetPreserved(AnalysisSetKey *SetID) const {
return NotPreservedAnalysisIDs.empty() &&
(PreservedIDs.count(&AllAnalysesKey) || PreservedIDs.count(SetID));
}
private:
/// A special key used to indicate all analyses.
static AnalysisSetKey AllAnalysesKey;
/// The IDs of analyses and analysis sets that are preserved.
SmallPtrSet<void *, 2> PreservedIDs;
/// The IDs of explicitly not-preserved analyses.
///
/// If an analysis in this set is covered by a set in `PreservedIDs`, we
/// consider it not-preserved. That is, `NotPreservedAnalysisIDs` always
/// "wins" over analysis sets in `PreservedIDs`.
///
/// Also, a given ID should never occur both here and in `PreservedIDs`.
SmallPtrSet<AnalysisKey *, 2> NotPreservedAnalysisIDs;
};
// Forward declare the analysis manager template.
template <typename IRUnitT, typename... ExtraArgTs> class AnalysisManager;
/// A CRTP mix-in to automatically provide informational APIs needed for
/// passes.
///
/// This provides some boilerplate for types that are passes.
template <typename DerivedT> struct PassInfoMixin {
/// Gets the name of the pass we are mixed into.
static StringRef name() {
+ static_assert(std::is_base_of<PassInfoMixin, DerivedT>::value,
+ "Must pass the derived type as the template argument!");
StringRef Name = getTypeName<DerivedT>();
if (Name.startswith("llvm::"))
Name = Name.drop_front(strlen("llvm::"));
return Name;
}
};
/// A CRTP mix-in that provides informational APIs needed for analysis passes.
///
/// This provides some boilerplate for types that are analysis passes. It
/// automatically mixes in \c PassInfoMixin.
template <typename DerivedT>
struct AnalysisInfoMixin : PassInfoMixin<DerivedT> {
/// Returns an opaque, unique ID for this analysis type.
///
/// This ID is a pointer type that is guaranteed to be 8-byte aligned and thus
/// suitable for use in sets, maps, and other data structures that use the low
/// bits of pointers.
///
/// Note that this requires the derived type provide a static \c AnalysisKey
/// member called \c Key.
///
/// FIXME: The only reason the mixin type itself can't declare the Key value
/// is that some compilers cannot correctly unique a templated static variable
/// so it has the same addresses in each instantiation. The only currently
/// known platform with this limitation is Windows DLL builds, specifically
/// building each part of LLVM as a DLL. If we ever remove that build
/// configuration, this mixin can provide the static key as well.
- static AnalysisKey *ID() { return &DerivedT::Key; }
+ static AnalysisKey *ID() {
+ static_assert(std::is_base_of<AnalysisInfoMixin, DerivedT>::value,
+ "Must pass the derived type as the template argument!");
+ return &DerivedT::Key;
+ }
};
/// This templated class represents "all analyses that operate over \<a
/// particular IR unit\>" (e.g. a Function or a Module) in instances of
/// PreservedAnalysis.
///
/// This lets a transformation say e.g. "I preserved all function analyses".
///
/// Note that you must provide an explicit instantiation declaration and
/// definition for this template in order to get the correct behavior on
/// Windows. Otherwise, the address of SetKey will not be stable.
template <typename IRUnitT>
class AllAnalysesOn {
public:
static AnalysisSetKey *ID() { return &SetKey; }
private:
static AnalysisSetKey SetKey;
};
template <typename IRUnitT> AnalysisSetKey AllAnalysesOn<IRUnitT>::SetKey;
extern template class AllAnalysesOn<Module>;
extern template class AllAnalysesOn<Function>;
/// \brief Manages a sequence of passes over a particular unit of IR.
///
/// A pass manager contains a sequence of passes to run over a particular unit
/// of IR (e.g. Functions, Modules). It is itself a valid pass over that unit of
/// IR, and when run over some given IR will run each of its contained passes in
/// sequence. Pass managers are the primary and most basic building block of a
/// pass pipeline.
///
/// When you run a pass manager, you provide an \c AnalysisManager<IRUnitT>
/// argument. The pass manager will propagate that analysis manager to each
/// pass it runs, and will call the analysis manager's invalidation routine with
/// the PreservedAnalyses of each pass it runs.
template <typename IRUnitT,
typename AnalysisManagerT = AnalysisManager<IRUnitT>,
typename... ExtraArgTs>
class PassManager : public PassInfoMixin<
PassManager<IRUnitT, AnalysisManagerT, ExtraArgTs...>> {
public:
/// \brief Construct a pass manager.
///
/// If \p DebugLogging is true, we'll log our progress to llvm::dbgs().
explicit PassManager(bool DebugLogging = false) : DebugLogging(DebugLogging) {}
// FIXME: These are equivalent to the default move constructor/move
// assignment. However, using = default triggers linker errors due to the
// explicit instantiations below. Find away to use the default and remove the
// duplicated code here.
PassManager(PassManager &&Arg)
: Passes(std::move(Arg.Passes)),
DebugLogging(std::move(Arg.DebugLogging)) {}
PassManager &operator=(PassManager &&RHS) {
Passes = std::move(RHS.Passes);
DebugLogging = std::move(RHS.DebugLogging);
return *this;
}
/// \brief Run all of the passes in this manager over the given unit of IR.
/// ExtraArgs are passed to each pass.
PreservedAnalyses run(IRUnitT &IR, AnalysisManagerT &AM,
ExtraArgTs... ExtraArgs) {
PreservedAnalyses PA = PreservedAnalyses::all();
if (DebugLogging)
dbgs() << "Starting " << getTypeName<IRUnitT>() << " pass manager run.\n";
for (unsigned Idx = 0, Size = Passes.size(); Idx != Size; ++Idx) {
if (DebugLogging)
dbgs() << "Running pass: " << Passes[Idx]->name() << " on "
<< IR.getName() << "\n";
PreservedAnalyses PassPA = Passes[Idx]->run(IR, AM, ExtraArgs...);
// Update the analysis manager as each pass runs and potentially
// invalidates analyses.
AM.invalidate(IR, PassPA);
// Finally, intersect the preserved analyses to compute the aggregate
// preserved set for this pass manager.
PA.intersect(std::move(PassPA));
// FIXME: Historically, the pass managers all called the LLVM context's
// yield function here. We don't have a generic way to acquire the
// context and it isn't yet clear what the right pattern is for yielding
// in the new pass manager so it is currently omitted.
//IR.getContext().yield();
}
// Invaliadtion was handled after each pass in the above loop for the
// current unit of IR. Therefore, the remaining analysis results in the
// AnalysisManager are preserved. We mark this with a set so that we don't
// need to inspect each one individually.
PA.preserveSet<AllAnalysesOn<IRUnitT>>();
if (DebugLogging)
dbgs() << "Finished " << getTypeName<IRUnitT>() << " pass manager run.\n";
return PA;
}
template <typename PassT> void addPass(PassT Pass) {
typedef detail::PassModel<IRUnitT, PassT, PreservedAnalyses,
AnalysisManagerT, ExtraArgTs...>
PassModelT;
Passes.emplace_back(new PassModelT(std::move(Pass)));
}
private:
typedef detail::PassConcept<IRUnitT, AnalysisManagerT, ExtraArgTs...>
PassConceptT;
std::vector<std::unique_ptr<PassConceptT>> Passes;
/// \brief Flag indicating whether we should do debug logging.
bool DebugLogging;
};
extern template class PassManager<Module>;
/// \brief Convenience typedef for a pass manager over modules.
typedef PassManager<Module> ModulePassManager;
extern template class PassManager<Function>;
/// \brief Convenience typedef for a pass manager over functions.
typedef PassManager<Function> FunctionPassManager;
/// \brief A container for analyses that lazily runs them and caches their
/// results.
///
/// This class can manage analyses for any IR unit where the address of the IR
/// unit sufficies as its identity.
template <typename IRUnitT, typename... ExtraArgTs> class AnalysisManager {
public:
class Invalidator;
private:
// Now that we've defined our invalidator, we can define the concept types.
typedef detail::AnalysisResultConcept<IRUnitT, PreservedAnalyses, Invalidator>
ResultConceptT;
typedef detail::AnalysisPassConcept<IRUnitT, PreservedAnalyses, Invalidator,
ExtraArgTs...>
PassConceptT;
/// \brief List of analysis pass IDs and associated concept pointers.
///
/// Requires iterators to be valid across appending new entries and arbitrary
/// erases. Provides the analysis ID to enable finding iterators to a given
/// entry in maps below, and provides the storage for the actual result
/// concept.
typedef std::list<std::pair<AnalysisKey *, std::unique_ptr<ResultConceptT>>>
AnalysisResultListT;
/// \brief Map type from IRUnitT pointer to our custom list type.
typedef DenseMap<IRUnitT *, AnalysisResultListT> AnalysisResultListMapT;
/// \brief Map type from a pair of analysis ID and IRUnitT pointer to an
/// iterator into a particular result list (which is where the actual analysis
/// result is stored).
typedef DenseMap<std::pair<AnalysisKey *, IRUnitT *>,
typename AnalysisResultListT::iterator>
AnalysisResultMapT;
public:
/// API to communicate dependencies between analyses during invalidation.
///
/// When an analysis result embeds handles to other analysis results, it
/// needs to be invalidated both when its own information isn't preserved and
/// when any of its embedded analysis results end up invalidated. We pass an
/// \c Invalidator object as an argument to \c invalidate() in order to let
/// the analysis results themselves define the dependency graph on the fly.
/// This lets us avoid building building an explicit representation of the
/// dependencies between analysis results.
class Invalidator {
public:
/// Trigger the invalidation of some other analysis pass if not already
/// handled and return whether it was in fact invalidated.
///
/// This is expected to be called from within a given analysis result's \c
/// invalidate method to trigger a depth-first walk of all inter-analysis
/// dependencies. The same \p IR unit and \p PA passed to that result's \c
/// invalidate method should in turn be provided to this routine.
///
/// The first time this is called for a given analysis pass, it will call
/// the corresponding result's \c invalidate method. Subsequent calls will
/// use a cache of the results of that initial call. It is an error to form
/// cyclic dependencies between analysis results.
///
/// This returns true if the given analysis's result is invalid. Any
/// dependecies on it will become invalid as a result.
template <typename PassT>
bool invalidate(IRUnitT &IR, const PreservedAnalyses &PA) {
typedef detail::AnalysisResultModel<IRUnitT, PassT,
typename PassT::Result,
PreservedAnalyses, Invalidator>
ResultModelT;
return invalidateImpl<ResultModelT>(PassT::ID(), IR, PA);
}
/// A type-erased variant of the above invalidate method with the same core
/// API other than passing an analysis ID rather than an analysis type
/// parameter.
///
/// This is sadly less efficient than the above routine, which leverages
/// the type parameter to avoid the type erasure overhead.
bool invalidate(AnalysisKey *ID, IRUnitT &IR, const PreservedAnalyses &PA) {
return invalidateImpl<>(ID, IR, PA);
}
private:
friend class AnalysisManager;
template <typename ResultT = ResultConceptT>
bool invalidateImpl(AnalysisKey *ID, IRUnitT &IR,
const PreservedAnalyses &PA) {
// If we've already visited this pass, return true if it was invalidated
// and false otherwise.
auto IMapI = IsResultInvalidated.find(ID);
if (IMapI != IsResultInvalidated.end())
return IMapI->second;
// Otherwise look up the result object.
auto RI = Results.find({ID, &IR});
assert(RI != Results.end() &&
"Trying to invalidate a dependent result that isn't in the "
"manager's cache is always an error, likely due to a stale result "
"handle!");
auto &Result = static_cast<ResultT &>(*RI->second->second);
// Insert into the map whether the result should be invalidated and return
// that. Note that we cannot reuse IMapI and must do a fresh insert here,
// as calling invalidate could (recursively) insert things into the map,
// making any iterator or reference invalid.
bool Inserted;
std::tie(IMapI, Inserted) =
IsResultInvalidated.insert({ID, Result.invalidate(IR, PA, *this)});
(void)Inserted;
assert(Inserted && "Should not have already inserted this ID, likely "
"indicates a dependency cycle!");
return IMapI->second;
}
Invalidator(SmallDenseMap<AnalysisKey *, bool, 8> &IsResultInvalidated,
const AnalysisResultMapT &Results)
: IsResultInvalidated(IsResultInvalidated), Results(Results) {}
SmallDenseMap<AnalysisKey *, bool, 8> &IsResultInvalidated;
const AnalysisResultMapT &Results;
};
/// \brief Construct an empty analysis manager.
///
/// If \p DebugLogging is true, we'll log our progress to llvm::dbgs().
AnalysisManager(bool DebugLogging = false) : DebugLogging(DebugLogging) {}
AnalysisManager(AnalysisManager &&) = default;
AnalysisManager &operator=(AnalysisManager &&) = default;
/// \brief Returns true if the analysis manager has an empty results cache.
bool empty() const {
assert(AnalysisResults.empty() == AnalysisResultLists.empty() &&
"The storage and index of analysis results disagree on how many "
"there are!");
return AnalysisResults.empty();
}
/// \brief Clear any cached analysis results for a single unit of IR.
///
/// This doesn't invalidate, but instead simply deletes, the relevant results.
/// It is useful when the IR is being removed and we want to clear out all the
/// memory pinned for it.
void clear(IRUnitT &IR) {
if (DebugLogging)
dbgs() << "Clearing all analysis results for: " << IR.getName() << "\n";
auto ResultsListI = AnalysisResultLists.find(&IR);
if (ResultsListI == AnalysisResultLists.end())
return;
// Delete the map entries that point into the results list.
for (auto &IDAndResult : ResultsListI->second)
AnalysisResults.erase({IDAndResult.first, &IR});
// And actually destroy and erase the results associated with this IR.
AnalysisResultLists.erase(ResultsListI);
}
/// \brief Clear all analysis results cached by this AnalysisManager.
///
/// Like \c clear(IRUnitT&), this doesn't invalidate the results; it simply
/// deletes them. This lets you clean up the AnalysisManager when the set of
/// IR units itself has potentially changed, and thus we can't even look up a
/// a result and invalidate/clear it directly.
void clear() {
AnalysisResults.clear();
AnalysisResultLists.clear();
}
/// \brief Get the result of an analysis pass for a given IR unit.
///
/// Runs the analysis if a cached result is not available.
template <typename PassT>
typename PassT::Result &getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs) {
assert(AnalysisPasses.count(PassT::ID()) &&
"This analysis pass was not registered prior to being queried");
ResultConceptT &ResultConcept =
getResultImpl(PassT::ID(), IR, ExtraArgs...);
typedef detail::AnalysisResultModel<IRUnitT, PassT, typename PassT::Result,
PreservedAnalyses, Invalidator>
ResultModelT;
return static_cast<ResultModelT &>(ResultConcept).Result;
}
/// \brief Get the cached result of an analysis pass for a given IR unit.
///
/// This method never runs the analysis.
///
/// \returns null if there is no cached result.
template <typename PassT>
typename PassT::Result *getCachedResult(IRUnitT &IR) const {
assert(AnalysisPasses.count(PassT::ID()) &&
"This analysis pass was not registered prior to being queried");
ResultConceptT *ResultConcept = getCachedResultImpl(PassT::ID(), IR);
if (!ResultConcept)
return nullptr;
typedef detail::AnalysisResultModel<IRUnitT, PassT, typename PassT::Result,
PreservedAnalyses, Invalidator>
ResultModelT;
return &static_cast<ResultModelT *>(ResultConcept)->Result;
}
/// \brief Register an analysis pass with the manager.
///
/// The parameter is a callable whose result is an analysis pass. This allows
/// passing in a lambda to construct the analysis.
///
/// The analysis type to register is the type returned by calling the \c
/// PassBuilder argument. If that type has already been registered, then the
/// argument will not be called and this function will return false.
/// Otherwise, we register the analysis returned by calling \c PassBuilder(),
/// and this function returns true.
///
/// (Note: Although the return value of this function indicates whether or not
/// an analysis was previously registered, there intentionally isn't a way to
/// query this directly. Instead, you should just register all the analyses
/// you might want and let this class run them lazily. This idiom lets us
/// minimize the number of times we have to look up analyses in our
/// hashtable.)
template <typename PassBuilderT>
bool registerPass(PassBuilderT &&PassBuilder) {
typedef decltype(PassBuilder()) PassT;
typedef detail::AnalysisPassModel<IRUnitT, PassT, PreservedAnalyses,
Invalidator, ExtraArgTs...>
PassModelT;
auto &PassPtr = AnalysisPasses[PassT::ID()];
if (PassPtr)
// Already registered this pass type!
return false;
// Construct a new model around the instance returned by the builder.
PassPtr.reset(new PassModelT(PassBuilder()));
return true;
}
/// \brief Invalidate a specific analysis pass for an IR module.
///
/// Note that the analysis result can disregard invalidation, if it determines
/// it is in fact still valid.
template <typename PassT> void invalidate(IRUnitT &IR) {
assert(AnalysisPasses.count(PassT::ID()) &&
"This analysis pass was not registered prior to being invalidated");
invalidateImpl(PassT::ID(), IR);
}
/// \brief Invalidate cached analyses for an IR unit.
///
/// Walk through all of the analyses pertaining to this unit of IR and
/// invalidate them, unless they are preserved by the PreservedAnalyses set.
void invalidate(IRUnitT &IR, const PreservedAnalyses &PA) {
// We're done if all analyses on this IR unit are preserved.
if (PA.allAnalysesInSetPreserved<AllAnalysesOn<IRUnitT>>())
return;
if (DebugLogging)
dbgs() << "Invalidating all non-preserved analyses for: " << IR.getName()
<< "\n";
// Track whether each analysis's result is invalidated in
// IsResultInvalidated.
SmallDenseMap<AnalysisKey *, bool, 8> IsResultInvalidated;
Invalidator Inv(IsResultInvalidated, AnalysisResults);
AnalysisResultListT &ResultsList = AnalysisResultLists[&IR];
for (auto &AnalysisResultPair : ResultsList) {
// This is basically the same thing as Invalidator::invalidate, but we
// can't call it here because we're operating on the type-erased result.
// Moreover if we instead called invalidate() directly, it would do an
// unnecessary look up in ResultsList.
AnalysisKey *ID = AnalysisResultPair.first;
auto &Result = *AnalysisResultPair.second;
auto IMapI = IsResultInvalidated.find(ID);
if (IMapI != IsResultInvalidated.end())
// This result was already handled via the Invalidator.
continue;
// Try to invalidate the result, giving it the Invalidator so it can
// recursively query for any dependencies it has and record the result.
// Note that we cannot reuse 'IMapI' here or pre-insert the ID, as
// Result.invalidate may insert things into the map, invalidating our
// iterator.
bool Inserted =
IsResultInvalidated.insert({ID, Result.invalidate(IR, PA, Inv)})
.second;
(void)Inserted;
assert(Inserted && "Should never have already inserted this ID, likely "
"indicates a cycle!");
}
// Now erase the results that were marked above as invalidated.
if (!IsResultInvalidated.empty()) {
for (auto I = ResultsList.begin(), E = ResultsList.end(); I != E;) {
AnalysisKey *ID = I->first;
if (!IsResultInvalidated.lookup(ID)) {
++I;
continue;
}
if (DebugLogging)
dbgs() << "Invalidating analysis: " << this->lookUpPass(ID).name()
<< "\n";
I = ResultsList.erase(I);
AnalysisResults.erase({ID, &IR});
}
}
if (ResultsList.empty())
AnalysisResultLists.erase(&IR);
}
private:
/// \brief Look up a registered analysis pass.
PassConceptT &lookUpPass(AnalysisKey *ID) {
typename AnalysisPassMapT::iterator PI = AnalysisPasses.find(ID);
assert(PI != AnalysisPasses.end() &&
"Analysis passes must be registered prior to being queried!");
return *PI->second;
}
/// \brief Look up a registered analysis pass.
const PassConceptT &lookUpPass(AnalysisKey *ID) const {
typename AnalysisPassMapT::const_iterator PI = AnalysisPasses.find(ID);
assert(PI != AnalysisPasses.end() &&
"Analysis passes must be registered prior to being queried!");
return *PI->second;
}
/// \brief Get an analysis result, running the pass if necessary.
ResultConceptT &getResultImpl(AnalysisKey *ID, IRUnitT &IR,
ExtraArgTs... ExtraArgs) {
typename AnalysisResultMapT::iterator RI;
bool Inserted;
std::tie(RI, Inserted) = AnalysisResults.insert(std::make_pair(
std::make_pair(ID, &IR), typename AnalysisResultListT::iterator()));
// If we don't have a cached result for this function, look up the pass and
// run it to produce a result, which we then add to the cache.
if (Inserted) {
auto &P = this->lookUpPass(ID);
if (DebugLogging)
dbgs() << "Running analysis: " << P.name() << "\n";
AnalysisResultListT &ResultList = AnalysisResultLists[&IR];
ResultList.emplace_back(ID, P.run(IR, *this, ExtraArgs...));
// P.run may have inserted elements into AnalysisResults and invalidated
// RI.
RI = AnalysisResults.find({ID, &IR});
assert(RI != AnalysisResults.end() && "we just inserted it!");
RI->second = std::prev(ResultList.end());
}
return *RI->second->second;
}
/// \brief Get a cached analysis result or return null.
ResultConceptT *getCachedResultImpl(AnalysisKey *ID, IRUnitT &IR) const {
typename AnalysisResultMapT::const_iterator RI =
AnalysisResults.find({ID, &IR});
return RI == AnalysisResults.end() ? nullptr : &*RI->second->second;
}
/// \brief Invalidate a function pass result.
void invalidateImpl(AnalysisKey *ID, IRUnitT &IR) {
typename AnalysisResultMapT::iterator RI =
AnalysisResults.find({ID, &IR});
if (RI == AnalysisResults.end())
return;
if (DebugLogging)
dbgs() << "Invalidating analysis: " << this->lookUpPass(ID).name()
<< "\n";
AnalysisResultLists[&IR].erase(RI->second);
AnalysisResults.erase(RI);
}
/// \brief Map type from module analysis pass ID to pass concept pointer.
typedef DenseMap<AnalysisKey *, std::unique_ptr<PassConceptT>> AnalysisPassMapT;
/// \brief Collection of module analysis passes, indexed by ID.
AnalysisPassMapT AnalysisPasses;
/// \brief Map from function to a list of function analysis results.
///
/// Provides linear time removal of all analysis results for a function and
/// the ultimate storage for a particular cached analysis result.
AnalysisResultListMapT AnalysisResultLists;
/// \brief Map from an analysis ID and function to a particular cached
/// analysis result.
AnalysisResultMapT AnalysisResults;
/// \brief Indicates whether we log to \c llvm::dbgs().
bool DebugLogging;
};
extern template class AnalysisManager<Module>;
/// \brief Convenience typedef for the Module analysis manager.
typedef AnalysisManager<Module> ModuleAnalysisManager;
extern template class AnalysisManager<Function>;
/// \brief Convenience typedef for the Function analysis manager.
typedef AnalysisManager<Function> FunctionAnalysisManager;
/// \brief An analysis over an "outer" IR unit that provides access to an
/// analysis manager over an "inner" IR unit. The inner unit must be contained
/// in the outer unit.
///
/// Fore example, InnerAnalysisManagerProxy<FunctionAnalysisManager, Module> is
/// an analysis over Modules (the "outer" unit) that provides access to a
/// Function analysis manager. The FunctionAnalysisManager is the "inner"
/// manager being proxied, and Functions are the "inner" unit. The inner/outer
/// relationship is valid because each Function is contained in one Module.
///
/// If you're (transitively) within a pass manager for an IR unit U that
/// contains IR unit V, you should never use an analysis manager over V, except
/// via one of these proxies.
///
/// Note that the proxy's result is a move-only RAII object. The validity of
/// the analyses in the inner analysis manager is tied to its lifetime.
template <typename AnalysisManagerT, typename IRUnitT, typename... ExtraArgTs>
class InnerAnalysisManagerProxy
: public AnalysisInfoMixin<
InnerAnalysisManagerProxy<AnalysisManagerT, IRUnitT>> {
public:
class Result {
public:
explicit Result(AnalysisManagerT &InnerAM) : InnerAM(&InnerAM) {}
Result(Result &&Arg) : InnerAM(std::move(Arg.InnerAM)) {
// We have to null out the analysis manager in the moved-from state
// because we are taking ownership of the responsibilty to clear the
// analysis state.
Arg.InnerAM = nullptr;
}
Result &operator=(Result &&RHS) {
InnerAM = RHS.InnerAM;
// We have to null out the analysis manager in the moved-from state
// because we are taking ownership of the responsibilty to clear the
// analysis state.
RHS.InnerAM = nullptr;
return *this;
}
~Result() {
// InnerAM is cleared in a moved from state where there is nothing to do.
if (!InnerAM)
return;
// Clear out the analysis manager if we're being destroyed -- it means we
// didn't even see an invalidate call when we got invalidated.
InnerAM->clear();
}
/// \brief Accessor for the analysis manager.
AnalysisManagerT &getManager() { return *InnerAM; }
/// \brief Handler for invalidation of the outer IR unit, \c IRUnitT.
///
/// If the proxy analysis itself is not preserved, we assume that the set of
/// inner IR objects contained in IRUnit may have changed. In this case,
/// we have to call \c clear() on the inner analysis manager, as it may now
/// have stale pointers to its inner IR objects.
///
/// Regardless of whether the proxy analysis is marked as preserved, all of
/// the analyses in the inner analysis manager are potentially invalidated
/// based on the set of preserved analyses.
bool invalidate(
IRUnitT &IR, const PreservedAnalyses &PA,
typename AnalysisManager<IRUnitT, ExtraArgTs...>::Invalidator &Inv);
private:
AnalysisManagerT *InnerAM;
};
explicit InnerAnalysisManagerProxy(AnalysisManagerT &InnerAM)
: InnerAM(&InnerAM) {}
/// \brief Run the analysis pass and create our proxy result object.
///
/// This doesn't do any interesting work; it is primarily used to insert our
/// proxy result object into the outer analysis cache so that we can proxy
/// invalidation to the inner analysis manager.
Result run(IRUnitT &IR, AnalysisManager<IRUnitT, ExtraArgTs...> &AM,
ExtraArgTs...) {
return Result(*InnerAM);
}
private:
friend AnalysisInfoMixin<
InnerAnalysisManagerProxy<AnalysisManagerT, IRUnitT>>;
static AnalysisKey Key;
AnalysisManagerT *InnerAM;
};
template <typename AnalysisManagerT, typename IRUnitT, typename... ExtraArgTs>
AnalysisKey
InnerAnalysisManagerProxy<AnalysisManagerT, IRUnitT, ExtraArgTs...>::Key;
/// Provide the \c FunctionAnalysisManager to \c Module proxy.
typedef InnerAnalysisManagerProxy<FunctionAnalysisManager, Module>
FunctionAnalysisManagerModuleProxy;
/// Specialization of the invalidate method for the \c
/// FunctionAnalysisManagerModuleProxy's result.
template <>
bool FunctionAnalysisManagerModuleProxy::Result::invalidate(
Module &M, const PreservedAnalyses &PA,
ModuleAnalysisManager::Invalidator &Inv);
// Ensure the \c FunctionAnalysisManagerModuleProxy is provided as an extern
// template.
extern template class InnerAnalysisManagerProxy<FunctionAnalysisManager,
Module>;
/// \brief An analysis over an "inner" IR unit that provides access to an
/// analysis manager over a "outer" IR unit. The inner unit must be contained
/// in the outer unit.
///
/// For example OuterAnalysisManagerProxy<ModuleAnalysisManager, Function> is an
/// analysis over Functions (the "inner" unit) which provides access to a Module
/// analysis manager. The ModuleAnalysisManager is the "outer" manager being
/// proxied, and Modules are the "outer" IR unit. The inner/outer relationship
/// is valid because each Function is contained in one Module.
///
/// This proxy only exposes the const interface of the outer analysis manager,
/// to indicate that you cannot cause an outer analysis to run from within an
/// inner pass. Instead, you must rely on the \c getCachedResult API.
///
/// This proxy doesn't manage invalidation in any way -- that is handled by the
/// recursive return path of each layer of the pass manager. A consequence of
/// this is the outer analyses may be stale. We invalidate the outer analyses
/// only when we're done running passes over the inner IR units.
template <typename AnalysisManagerT, typename IRUnitT, typename... ExtraArgTs>
class OuterAnalysisManagerProxy
: public AnalysisInfoMixin<
- OuterAnalysisManagerProxy<AnalysisManagerT, IRUnitT>> {
+ OuterAnalysisManagerProxy<AnalysisManagerT, IRUnitT, ExtraArgTs...>> {
public:
/// \brief Result proxy object for \c OuterAnalysisManagerProxy.
class Result {
public:
explicit Result(const AnalysisManagerT &AM) : AM(&AM) {}
const AnalysisManagerT &getManager() const { return *AM; }
/// \brief Handle invalidation by ignoring it; this pass is immutable.
bool invalidate(
IRUnitT &, const PreservedAnalyses &,
typename AnalysisManager<IRUnitT, ExtraArgTs...>::Invalidator &) {
return false;
}
/// Register a deferred invalidation event for when the outer analysis
/// manager processes its invalidations.
template <typename OuterAnalysisT, typename InvalidatedAnalysisT>
void registerOuterAnalysisInvalidation() {
AnalysisKey *OuterID = OuterAnalysisT::ID();
AnalysisKey *InvalidatedID = InvalidatedAnalysisT::ID();
auto &InvalidatedIDList = OuterAnalysisInvalidationMap[OuterID];
// Note, this is a linear scan. If we end up with large numbers of
// analyses that all trigger invalidation on the same outer analysis,
// this entire system should be changed to some other deterministic
// data structure such as a `SetVector` of a pair of pointers.
auto InvalidatedIt = std::find(InvalidatedIDList.begin(),
InvalidatedIDList.end(), InvalidatedID);
if (InvalidatedIt == InvalidatedIDList.end())
InvalidatedIDList.push_back(InvalidatedID);
}
/// Access the map from outer analyses to deferred invalidation requiring
/// analyses.
const SmallDenseMap<AnalysisKey *, TinyPtrVector<AnalysisKey *>, 2> &
getOuterInvalidations() const {
return OuterAnalysisInvalidationMap;
}
private:
const AnalysisManagerT *AM;
/// A map from an outer analysis ID to the set of this IR-unit's analyses
/// which need to be invalidated.
SmallDenseMap<AnalysisKey *, TinyPtrVector<AnalysisKey *>, 2>
OuterAnalysisInvalidationMap;
};
OuterAnalysisManagerProxy(const AnalysisManagerT &AM) : AM(&AM) {}
/// \brief Run the analysis pass and create our proxy result object.
/// Nothing to see here, it just forwards the \c AM reference into the
/// result.
Result run(IRUnitT &, AnalysisManager<IRUnitT, ExtraArgTs...> &,
ExtraArgTs...) {
return Result(*AM);
}
private:
friend AnalysisInfoMixin<
- OuterAnalysisManagerProxy<AnalysisManagerT, IRUnitT>>;
+ OuterAnalysisManagerProxy<AnalysisManagerT, IRUnitT, ExtraArgTs...>>;
static AnalysisKey Key;
const AnalysisManagerT *AM;
};
template <typename AnalysisManagerT, typename IRUnitT, typename... ExtraArgTs>
AnalysisKey
OuterAnalysisManagerProxy<AnalysisManagerT, IRUnitT, ExtraArgTs...>::Key;
extern template class OuterAnalysisManagerProxy<ModuleAnalysisManager,
Function>;
/// Provide the \c ModuleAnalysisManager to \c Function proxy.
typedef OuterAnalysisManagerProxy<ModuleAnalysisManager, Function>
ModuleAnalysisManagerFunctionProxy;
/// \brief Trivial adaptor that maps from a module to its functions.
///
/// Designed to allow composition of a FunctionPass(Manager) and
/// a ModulePassManager, by running the FunctionPass(Manager) over every
/// function in the module.
///
/// Function passes run within this adaptor can rely on having exclusive access
/// to the function they are run over. They should not read or modify any other
/// functions! Other threads or systems may be manipulating other functions in
/// the module, and so their state should never be relied on.
/// FIXME: Make the above true for all of LLVM's actual passes, some still
/// violate this principle.
///
/// Function passes can also read the module containing the function, but they
/// should not modify that module outside of the use lists of various globals.
/// For example, a function pass is not permitted to add functions to the
/// module.
/// FIXME: Make the above true for all of LLVM's actual passes, some still
/// violate this principle.
///
/// Note that although function passes can access module analyses, module
/// analyses are not invalidated while the function passes are running, so they
/// may be stale. Function analyses will not be stale.
template <typename FunctionPassT>
class ModuleToFunctionPassAdaptor
: public PassInfoMixin<ModuleToFunctionPassAdaptor<FunctionPassT>> {
public:
explicit ModuleToFunctionPassAdaptor(FunctionPassT Pass)
: Pass(std::move(Pass)) {}
/// \brief Runs the function pass across every function in the module.
PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM) {
FunctionAnalysisManager &FAM =
AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
PreservedAnalyses PA = PreservedAnalyses::all();
for (Function &F : M) {
if (F.isDeclaration())
continue;
PreservedAnalyses PassPA = Pass.run(F, FAM);
// We know that the function pass couldn't have invalidated any other
// function's analyses (that's the contract of a function pass), so
// directly handle the function analysis manager's invalidation here.
FAM.invalidate(F, PassPA);
// Then intersect the preserved set so that invalidation of module
// analyses will eventually occur when the module pass completes.
PA.intersect(std::move(PassPA));
}
// The FunctionAnalysisManagerModuleProxy is preserved because (we assume)
// the function passes we ran didn't add or remove any functions.
//
// We also preserve all analyses on Functions, because we did all the
// invalidation we needed to do above.
PA.preserveSet<AllAnalysesOn<Function>>();
PA.preserve<FunctionAnalysisManagerModuleProxy>();
return PA;
}
private:
FunctionPassT Pass;
};
/// \brief A function to deduce a function pass type and wrap it in the
/// templated adaptor.
template <typename FunctionPassT>
ModuleToFunctionPassAdaptor<FunctionPassT>
createModuleToFunctionPassAdaptor(FunctionPassT Pass) {
return ModuleToFunctionPassAdaptor<FunctionPassT>(std::move(Pass));
}
/// \brief A utility pass template to force an analysis result to be available.
///
/// If there are extra arguments at the pass's run level there may also be
/// extra arguments to the analysis manager's \c getResult routine. We can't
/// guess how to effectively map the arguments from one to the other, and so
/// this specialization just ignores them.
///
/// Specific patterns of run-method extra arguments and analysis manager extra
/// arguments will have to be defined as appropriate specializations.
template <typename AnalysisT, typename IRUnitT,
typename AnalysisManagerT = AnalysisManager<IRUnitT>,
typename... ExtraArgTs>
struct RequireAnalysisPass
: PassInfoMixin<RequireAnalysisPass<AnalysisT, IRUnitT, AnalysisManagerT,
ExtraArgTs...>> {
/// \brief Run this pass over some unit of IR.
///
/// This pass can be run over any unit of IR and use any analysis manager
/// provided they satisfy the basic API requirements. When this pass is
/// created, these methods can be instantiated to satisfy whatever the
/// context requires.
PreservedAnalyses run(IRUnitT &Arg, AnalysisManagerT &AM,
ExtraArgTs &&... Args) {
(void)AM.template getResult<AnalysisT>(Arg,
std::forward<ExtraArgTs>(Args)...);
return PreservedAnalyses::all();
}
};
/// \brief A no-op pass template which simply forces a specific analysis result
/// to be invalidated.
template <typename AnalysisT>
struct InvalidateAnalysisPass
: PassInfoMixin<InvalidateAnalysisPass<AnalysisT>> {
/// \brief Run this pass over some unit of IR.
///
/// This pass can be run over any unit of IR and use any analysis manager,
/// provided they satisfy the basic API requirements. When this pass is
/// created, these methods can be instantiated to satisfy whatever the
/// context requires.
template <typename IRUnitT, typename AnalysisManagerT, typename... ExtraArgTs>
PreservedAnalyses run(IRUnitT &Arg, AnalysisManagerT &AM, ExtraArgTs &&...) {
auto PA = PreservedAnalyses::all();
PA.abandon<AnalysisT>();
return PA;
}
};
/// \brief A utility pass that does nothing, but preserves no analyses.
///
/// Because this preserves no analyses, any analysis passes queried after this
/// pass runs will recompute fresh results.
struct InvalidateAllAnalysesPass : PassInfoMixin<InvalidateAllAnalysesPass> {
/// \brief Run this pass over some unit of IR.
template <typename IRUnitT, typename AnalysisManagerT, typename... ExtraArgTs>
PreservedAnalyses run(IRUnitT &, AnalysisManagerT &, ExtraArgTs &&...) {
return PreservedAnalyses::none();
}
};
/// A utility pass template that simply runs another pass multiple times.
///
/// This can be useful when debugging or testing passes. It also serves as an
/// example of how to extend the pass manager in ways beyond composition.
template <typename PassT>
class RepeatedPass : public PassInfoMixin<RepeatedPass<PassT>> {
public:
RepeatedPass(int Count, PassT P) : Count(Count), P(std::move(P)) {}
template <typename IRUnitT, typename AnalysisManagerT, typename... Ts>
PreservedAnalyses run(IRUnitT &Arg, AnalysisManagerT &AM, Ts &&... Args) {
auto PA = PreservedAnalyses::all();
for (int i = 0; i < Count; ++i)
PA.intersect(P.run(Arg, AM, std::forward<Ts>(Args)...));
return PA;
}
private:
int Count;
PassT P;
};
template <typename PassT>
RepeatedPass<PassT> createRepeatedPass(int Count, PassT P) {
return RepeatedPass<PassT>(Count, std::move(P));
}
}
#endif
Index: projects/clang400-import/contrib/llvm/include/llvm/Target/TargetInstrInfo.h
===================================================================
--- projects/clang400-import/contrib/llvm/include/llvm/Target/TargetInstrInfo.h (revision 313642)
+++ projects/clang400-import/contrib/llvm/include/llvm/Target/TargetInstrInfo.h (revision 313643)
@@ -1,1548 +1,1529 @@
//===-- llvm/Target/TargetInstrInfo.h - Instruction Info --------*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file describes the target machine instruction set to the code generator.
//
//===----------------------------------------------------------------------===//
#ifndef LLVM_TARGET_TARGETINSTRINFO_H
#define LLVM_TARGET_TARGETINSTRINFO_H
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/CodeGen/MachineCombinerPattern.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/MC/MCInstrInfo.h"
#include "llvm/Support/BranchProbability.h"
#include "llvm/Target/TargetRegisterInfo.h"
#include "llvm/CodeGen/LiveIntervalAnalysis.h"
namespace llvm {
class InstrItineraryData;
class LiveVariables;
class MCAsmInfo;
class MachineMemOperand;
class MachineRegisterInfo;
class MDNode;
class MCInst;
struct MCSchedModel;
class MCSymbolRefExpr;
class SDNode;
class ScheduleHazardRecognizer;
class SelectionDAG;
class ScheduleDAG;
class TargetRegisterClass;
class TargetRegisterInfo;
class TargetSubtargetInfo;
class TargetSchedModel;
class DFAPacketizer;
template<class T> class SmallVectorImpl;
//---------------------------------------------------------------------------
///
/// TargetInstrInfo - Interface to description of machine instruction set
///
class TargetInstrInfo : public MCInstrInfo {
TargetInstrInfo(const TargetInstrInfo &) = delete;
void operator=(const TargetInstrInfo &) = delete;
public:
TargetInstrInfo(unsigned CFSetupOpcode = ~0u, unsigned CFDestroyOpcode = ~0u,
unsigned CatchRetOpcode = ~0u, unsigned ReturnOpcode = ~0u)
: CallFrameSetupOpcode(CFSetupOpcode),
CallFrameDestroyOpcode(CFDestroyOpcode),
CatchRetOpcode(CatchRetOpcode),
ReturnOpcode(ReturnOpcode) {}
virtual ~TargetInstrInfo();
static bool isGenericOpcode(unsigned Opc) {
return Opc <= TargetOpcode::GENERIC_OP_END;
}
/// Given a machine instruction descriptor, returns the register
/// class constraint for OpNum, or NULL.
const TargetRegisterClass *getRegClass(const MCInstrDesc &TID,
unsigned OpNum,
const TargetRegisterInfo *TRI,
const MachineFunction &MF) const;
/// Return true if the instruction is trivially rematerializable, meaning it
/// has no side effects and requires no operands that aren't always available.
/// This means the only allowed uses are constants and unallocatable physical
/// registers so that the instructions result is independent of the place
/// in the function.
bool isTriviallyReMaterializable(const MachineInstr &MI,
AliasAnalysis *AA = nullptr) const {
return MI.getOpcode() == TargetOpcode::IMPLICIT_DEF ||
(MI.getDesc().isRematerializable() &&
(isReallyTriviallyReMaterializable(MI, AA) ||
isReallyTriviallyReMaterializableGeneric(MI, AA)));
}
protected:
/// For instructions with opcodes for which the M_REMATERIALIZABLE flag is
/// set, this hook lets the target specify whether the instruction is actually
/// trivially rematerializable, taking into consideration its operands. This
/// predicate must return false if the instruction has any side effects other
/// than producing a value, or if it requres any address registers that are
/// not always available.
/// Requirements must be check as stated in isTriviallyReMaterializable() .
virtual bool isReallyTriviallyReMaterializable(const MachineInstr &MI,
AliasAnalysis *AA) const {
return false;
}
/// This method commutes the operands of the given machine instruction MI.
/// The operands to be commuted are specified by their indices OpIdx1 and
/// OpIdx2.
///
/// If a target has any instructions that are commutable but require
/// converting to different instructions or making non-trivial changes
/// to commute them, this method can be overloaded to do that.
/// The default implementation simply swaps the commutable operands.
///
/// If NewMI is false, MI is modified in place and returned; otherwise, a
/// new machine instruction is created and returned.
///
/// Do not call this method for a non-commutable instruction.
/// Even though the instruction is commutable, the method may still
/// fail to commute the operands, null pointer is returned in such cases.
virtual MachineInstr *commuteInstructionImpl(MachineInstr &MI, bool NewMI,
unsigned OpIdx1,
unsigned OpIdx2) const;
/// Assigns the (CommutableOpIdx1, CommutableOpIdx2) pair of commutable
/// operand indices to (ResultIdx1, ResultIdx2).
/// One or both input values of the pair: (ResultIdx1, ResultIdx2) may be
/// predefined to some indices or be undefined (designated by the special
/// value 'CommuteAnyOperandIndex').
/// The predefined result indices cannot be re-defined.
/// The function returns true iff after the result pair redefinition
/// the fixed result pair is equal to or equivalent to the source pair of
/// indices: (CommutableOpIdx1, CommutableOpIdx2). It is assumed here that
/// the pairs (x,y) and (y,x) are equivalent.
static bool fixCommutedOpIndices(unsigned &ResultIdx1,
unsigned &ResultIdx2,
unsigned CommutableOpIdx1,
unsigned CommutableOpIdx2);
private:
/// For instructions with opcodes for which the M_REMATERIALIZABLE flag is
/// set and the target hook isReallyTriviallyReMaterializable returns false,
/// this function does target-independent tests to determine if the
/// instruction is really trivially rematerializable.
bool isReallyTriviallyReMaterializableGeneric(const MachineInstr &MI,
AliasAnalysis *AA) const;
public:
/// These methods return the opcode of the frame setup/destroy instructions
/// if they exist (-1 otherwise). Some targets use pseudo instructions in
/// order to abstract away the difference between operating with a frame
/// pointer and operating without, through the use of these two instructions.
///
unsigned getCallFrameSetupOpcode() const { return CallFrameSetupOpcode; }
unsigned getCallFrameDestroyOpcode() const { return CallFrameDestroyOpcode; }
unsigned getCatchReturnOpcode() const { return CatchRetOpcode; }
unsigned getReturnOpcode() const { return ReturnOpcode; }
/// Returns the actual stack pointer adjustment made by an instruction
/// as part of a call sequence. By default, only call frame setup/destroy
/// instructions adjust the stack, but targets may want to override this
/// to enable more fine-grained adjustment, or adjust by a different value.
virtual int getSPAdjust(const MachineInstr &MI) const;
/// Return true if the instruction is a "coalescable" extension instruction.
/// That is, it's like a copy where it's legal for the source to overlap the
/// destination. e.g. X86::MOVSX64rr32. If this returns true, then it's
/// expected the pre-extension value is available as a subreg of the result
/// register. This also returns the sub-register index in SubIdx.
virtual bool isCoalescableExtInstr(const MachineInstr &MI,
unsigned &SrcReg, unsigned &DstReg,
unsigned &SubIdx) const {
return false;
}
/// If the specified machine instruction is a direct
/// load from a stack slot, return the virtual or physical register number of
/// the destination along with the FrameIndex of the loaded stack slot. If
/// not, return 0. This predicate must return 0 if the instruction has
/// any side effects other than loading from the stack slot.
virtual unsigned isLoadFromStackSlot(const MachineInstr &MI,
int &FrameIndex) const {
return 0;
}
/// Check for post-frame ptr elimination stack locations as well.
/// This uses a heuristic so it isn't reliable for correctness.
virtual unsigned isLoadFromStackSlotPostFE(const MachineInstr &MI,
int &FrameIndex) const {
return 0;
}
/// If the specified machine instruction has a load from a stack slot,
/// return true along with the FrameIndex of the loaded stack slot and the
/// machine mem operand containing the reference.
/// If not, return false. Unlike isLoadFromStackSlot, this returns true for
/// any instructions that loads from the stack. This is just a hint, as some
/// cases may be missed.
virtual bool hasLoadFromStackSlot(const MachineInstr &MI,
const MachineMemOperand *&MMO,
int &FrameIndex) const;
/// If the specified machine instruction is a direct
/// store to a stack slot, return the virtual or physical register number of
/// the source reg along with the FrameIndex of the loaded stack slot. If
/// not, return 0. This predicate must return 0 if the instruction has
/// any side effects other than storing to the stack slot.
virtual unsigned isStoreToStackSlot(const MachineInstr &MI,
int &FrameIndex) const {
return 0;
}
/// Check for post-frame ptr elimination stack locations as well.
/// This uses a heuristic, so it isn't reliable for correctness.
virtual unsigned isStoreToStackSlotPostFE(const MachineInstr &MI,
int &FrameIndex) const {
return 0;
}
/// If the specified machine instruction has a store to a stack slot,
/// return true along with the FrameIndex of the loaded stack slot and the
/// machine mem operand containing the reference.
/// If not, return false. Unlike isStoreToStackSlot,
/// this returns true for any instructions that stores to the
/// stack. This is just a hint, as some cases may be missed.
virtual bool hasStoreToStackSlot(const MachineInstr &MI,
const MachineMemOperand *&MMO,
int &FrameIndex) const;
/// Return true if the specified machine instruction
/// is a copy of one stack slot to another and has no other effect.
/// Provide the identity of the two frame indices.
virtual bool isStackSlotCopy(const MachineInstr &MI, int &DestFrameIndex,
int &SrcFrameIndex) const {
return false;
}
/// Compute the size in bytes and offset within a stack slot of a spilled
/// register or subregister.
///
/// \param [out] Size in bytes of the spilled value.
/// \param [out] Offset in bytes within the stack slot.
/// \returns true if both Size and Offset are successfully computed.
///
/// Not all subregisters have computable spill slots. For example,
/// subregisters registers may not be byte-sized, and a pair of discontiguous
/// subregisters has no single offset.
///
/// Targets with nontrivial bigendian implementations may need to override
/// this, particularly to support spilled vector registers.
virtual bool getStackSlotRange(const TargetRegisterClass *RC, unsigned SubIdx,
unsigned &Size, unsigned &Offset,
const MachineFunction &MF) const;
/// Returns the size in bytes of the specified MachineInstr, or ~0U
/// when this function is not implemented by a target.
virtual unsigned getInstSizeInBytes(const MachineInstr &MI) const {
return ~0U;
}
/// Return true if the instruction is as cheap as a move instruction.
///
/// Targets for different archs need to override this, and different
/// micro-architectures can also be finely tuned inside.
virtual bool isAsCheapAsAMove(const MachineInstr &MI) const {
return MI.isAsCheapAsAMove();
}
/// Return true if the instruction should be sunk by MachineSink.
///
/// MachineSink determines on its own whether the instruction is safe to sink;
/// this gives the target a hook to override the default behavior with regards
/// to which instructions should be sunk.
virtual bool shouldSink(const MachineInstr &MI) const {
return true;
}
/// Re-issue the specified 'original' instruction at the
/// specific location targeting a new destination register.
/// The register in Orig->getOperand(0).getReg() will be substituted by
/// DestReg:SubIdx. Any existing subreg index is preserved or composed with
/// SubIdx.
virtual void reMaterialize(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI, unsigned DestReg,
unsigned SubIdx, const MachineInstr &Orig,
const TargetRegisterInfo &TRI) const;
/// Create a duplicate of the Orig instruction in MF. This is like
/// MachineFunction::CloneMachineInstr(), but the target may update operands
/// that are required to be unique.
///
/// The instruction must be duplicable as indicated by isNotDuplicable().
virtual MachineInstr *duplicate(MachineInstr &Orig,
MachineFunction &MF) const;
/// This method must be implemented by targets that
/// set the M_CONVERTIBLE_TO_3_ADDR flag. When this flag is set, the target
/// may be able to convert a two-address instruction into one or more true
/// three-address instructions on demand. This allows the X86 target (for
/// example) to convert ADD and SHL instructions into LEA instructions if they
/// would require register copies due to two-addressness.
///
/// This method returns a null pointer if the transformation cannot be
/// performed, otherwise it returns the last new instruction.
///
virtual MachineInstr *convertToThreeAddress(MachineFunction::iterator &MFI,
MachineInstr &MI,
LiveVariables *LV) const {
return nullptr;
}
// This constant can be used as an input value of operand index passed to
// the method findCommutedOpIndices() to tell the method that the
// corresponding operand index is not pre-defined and that the method
// can pick any commutable operand.
static const unsigned CommuteAnyOperandIndex = ~0U;
/// This method commutes the operands of the given machine instruction MI.
///
/// The operands to be commuted are specified by their indices OpIdx1 and
/// OpIdx2. OpIdx1 and OpIdx2 arguments may be set to a special value
/// 'CommuteAnyOperandIndex', which means that the method is free to choose
/// any arbitrarily chosen commutable operand. If both arguments are set to
/// 'CommuteAnyOperandIndex' then the method looks for 2 different commutable
/// operands; then commutes them if such operands could be found.
///
/// If NewMI is false, MI is modified in place and returned; otherwise, a
/// new machine instruction is created and returned.
///
/// Do not call this method for a non-commutable instruction or
/// for non-commuable operands.
/// Even though the instruction is commutable, the method may still
/// fail to commute the operands, null pointer is returned in such cases.
MachineInstr *
commuteInstruction(MachineInstr &MI, bool NewMI = false,
unsigned OpIdx1 = CommuteAnyOperandIndex,
unsigned OpIdx2 = CommuteAnyOperandIndex) const;
/// Returns true iff the routine could find two commutable operands in the
/// given machine instruction.
/// The 'SrcOpIdx1' and 'SrcOpIdx2' are INPUT and OUTPUT arguments.
/// If any of the INPUT values is set to the special value
/// 'CommuteAnyOperandIndex' then the method arbitrarily picks a commutable
/// operand, then returns its index in the corresponding argument.
/// If both of INPUT values are set to 'CommuteAnyOperandIndex' then method
/// looks for 2 commutable operands.
/// If INPUT values refer to some operands of MI, then the method simply
/// returns true if the corresponding operands are commutable and returns
/// false otherwise.
///
/// For example, calling this method this way:
/// unsigned Op1 = 1, Op2 = CommuteAnyOperandIndex;
/// findCommutedOpIndices(MI, Op1, Op2);
/// can be interpreted as a query asking to find an operand that would be
/// commutable with the operand#1.
virtual bool findCommutedOpIndices(MachineInstr &MI, unsigned &SrcOpIdx1,
unsigned &SrcOpIdx2) const;
/// A pair composed of a register and a sub-register index.
/// Used to give some type checking when modeling Reg:SubReg.
struct RegSubRegPair {
unsigned Reg;
unsigned SubReg;
RegSubRegPair(unsigned Reg = 0, unsigned SubReg = 0)
: Reg(Reg), SubReg(SubReg) {}
};
/// A pair composed of a pair of a register and a sub-register index,
/// and another sub-register index.
/// Used to give some type checking when modeling Reg:SubReg1, SubReg2.
struct RegSubRegPairAndIdx : RegSubRegPair {
unsigned SubIdx;
RegSubRegPairAndIdx(unsigned Reg = 0, unsigned SubReg = 0,
unsigned SubIdx = 0)
: RegSubRegPair(Reg, SubReg), SubIdx(SubIdx) {}
};
/// Build the equivalent inputs of a REG_SEQUENCE for the given \p MI
/// and \p DefIdx.
/// \p [out] InputRegs of the equivalent REG_SEQUENCE. Each element of
/// the list is modeled as <Reg:SubReg, SubIdx>.
/// E.g., REG_SEQUENCE vreg1:sub1, sub0, vreg2, sub1 would produce
/// two elements:
/// - vreg1:sub1, sub0
/// - vreg2<:0>, sub1
///
/// \returns true if it is possible to build such an input sequence
/// with the pair \p MI, \p DefIdx. False otherwise.
///
/// \pre MI.isRegSequence() or MI.isRegSequenceLike().
///
/// \note The generic implementation does not provide any support for
/// MI.isRegSequenceLike(). In other words, one has to override
/// getRegSequenceLikeInputs for target specific instructions.
bool
getRegSequenceInputs(const MachineInstr &MI, unsigned DefIdx,
SmallVectorImpl<RegSubRegPairAndIdx> &InputRegs) const;
/// Build the equivalent inputs of a EXTRACT_SUBREG for the given \p MI
/// and \p DefIdx.
/// \p [out] InputReg of the equivalent EXTRACT_SUBREG.
/// E.g., EXTRACT_SUBREG vreg1:sub1, sub0, sub1 would produce:
/// - vreg1:sub1, sub0
///
/// \returns true if it is possible to build such an input sequence
/// with the pair \p MI, \p DefIdx. False otherwise.
///
/// \pre MI.isExtractSubreg() or MI.isExtractSubregLike().
///
/// \note The generic implementation does not provide any support for
/// MI.isExtractSubregLike(). In other words, one has to override
/// getExtractSubregLikeInputs for target specific instructions.
bool
getExtractSubregInputs(const MachineInstr &MI, unsigned DefIdx,
RegSubRegPairAndIdx &InputReg) const;
/// Build the equivalent inputs of a INSERT_SUBREG for the given \p MI
/// and \p DefIdx.
/// \p [out] BaseReg and \p [out] InsertedReg contain
/// the equivalent inputs of INSERT_SUBREG.
/// E.g., INSERT_SUBREG vreg0:sub0, vreg1:sub1, sub3 would produce:
/// - BaseReg: vreg0:sub0
/// - InsertedReg: vreg1:sub1, sub3
///
/// \returns true if it is possible to build such an input sequence
/// with the pair \p MI, \p DefIdx. False otherwise.
///
/// \pre MI.isInsertSubreg() or MI.isInsertSubregLike().
///
/// \note The generic implementation does not provide any support for
/// MI.isInsertSubregLike(). In other words, one has to override
/// getInsertSubregLikeInputs for target specific instructions.
bool
getInsertSubregInputs(const MachineInstr &MI, unsigned DefIdx,
RegSubRegPair &BaseReg,
RegSubRegPairAndIdx &InsertedReg) const;
/// Return true if two machine instructions would produce identical values.
/// By default, this is only true when the two instructions
/// are deemed identical except for defs. If this function is called when the
/// IR is still in SSA form, the caller can pass the MachineRegisterInfo for
/// aggressive checks.
virtual bool produceSameValue(const MachineInstr &MI0,
const MachineInstr &MI1,
const MachineRegisterInfo *MRI = nullptr) const;
/// \returns true if a branch from an instruction with opcode \p BranchOpc
/// bytes is capable of jumping to a position \p BrOffset bytes away.
virtual bool isBranchOffsetInRange(unsigned BranchOpc,
int64_t BrOffset) const {
llvm_unreachable("target did not implement");
}
/// \returns The block that branch instruction \p MI jumps to.
virtual MachineBasicBlock *getBranchDestBlock(const MachineInstr &MI) const {
llvm_unreachable("target did not implement");
}
/// Insert an unconditional indirect branch at the end of \p MBB to \p
/// NewDestBB. \p BrOffset indicates the offset of \p NewDestBB relative to
/// the offset of the position to insert the new branch.
///
/// \returns The number of bytes added to the block.
virtual unsigned insertIndirectBranch(MachineBasicBlock &MBB,
MachineBasicBlock &NewDestBB,
const DebugLoc &DL,
int64_t BrOffset = 0,
RegScavenger *RS = nullptr) const {
llvm_unreachable("target did not implement");
}
/// Analyze the branching code at the end of MBB, returning
/// true if it cannot be understood (e.g. it's a switch dispatch or isn't
/// implemented for a target). Upon success, this returns false and returns
/// with the following information in various cases:
///
/// 1. If this block ends with no branches (it just falls through to its succ)
/// just return false, leaving TBB/FBB null.
/// 2. If this block ends with only an unconditional branch, it sets TBB to be
/// the destination block.
/// 3. If this block ends with a conditional branch and it falls through to a
/// successor block, it sets TBB to be the branch destination block and a
/// list of operands that evaluate the condition. These operands can be
/// passed to other TargetInstrInfo methods to create new branches.
/// 4. If this block ends with a conditional branch followed by an
/// unconditional branch, it returns the 'true' destination in TBB, the
/// 'false' destination in FBB, and a list of operands that evaluate the
/// condition. These operands can be passed to other TargetInstrInfo
/// methods to create new branches.
///
/// Note that removeBranch and insertBranch must be implemented to support
/// cases where this method returns success.
///
/// If AllowModify is true, then this routine is allowed to modify the basic
/// block (e.g. delete instructions after the unconditional branch).
///
/// The CFG information in MBB.Predecessors and MBB.Successors must be valid
/// before calling this function.
virtual bool analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
MachineBasicBlock *&FBB,
SmallVectorImpl<MachineOperand> &Cond,
bool AllowModify = false) const {
return true;
}
/// Represents a predicate at the MachineFunction level. The control flow a
/// MachineBranchPredicate represents is:
///
/// Reg <def>= LHS `Predicate` RHS == ConditionDef
/// if Reg then goto TrueDest else goto FalseDest
///
struct MachineBranchPredicate {
enum ComparePredicate {
PRED_EQ, // True if two values are equal
PRED_NE, // True if two values are not equal
PRED_INVALID // Sentinel value
};
ComparePredicate Predicate;
MachineOperand LHS;
MachineOperand RHS;
MachineBasicBlock *TrueDest;
MachineBasicBlock *FalseDest;
MachineInstr *ConditionDef;
/// SingleUseCondition is true if ConditionDef is dead except for the
/// branch(es) at the end of the basic block.
///
bool SingleUseCondition;
explicit MachineBranchPredicate()
: Predicate(PRED_INVALID), LHS(MachineOperand::CreateImm(0)),
RHS(MachineOperand::CreateImm(0)), TrueDest(nullptr),
FalseDest(nullptr), ConditionDef(nullptr), SingleUseCondition(false) {
}
};
/// Analyze the branching code at the end of MBB and parse it into the
/// MachineBranchPredicate structure if possible. Returns false on success
/// and true on failure.
///
/// If AllowModify is true, then this routine is allowed to modify the basic
/// block (e.g. delete instructions after the unconditional branch).
///
virtual bool analyzeBranchPredicate(MachineBasicBlock &MBB,
MachineBranchPredicate &MBP,
bool AllowModify = false) const {
return true;
}
/// Remove the branching code at the end of the specific MBB.
/// This is only invoked in cases where AnalyzeBranch returns success. It
/// returns the number of instructions that were removed.
/// If \p BytesRemoved is non-null, report the change in code size from the
/// removed instructions.
virtual unsigned removeBranch(MachineBasicBlock &MBB,
int *BytesRemoved = nullptr) const {
llvm_unreachable("Target didn't implement TargetInstrInfo::removeBranch!");
}
/// Insert branch code into the end of the specified MachineBasicBlock. The
/// operands to this method are the same as those returned by AnalyzeBranch.
/// This is only invoked in cases where AnalyzeBranch returns success. It
/// returns the number of instructions inserted. If \p BytesAdded is non-null,
/// report the change in code size from the added instructions.
///
/// It is also invoked by tail merging to add unconditional branches in
/// cases where AnalyzeBranch doesn't apply because there was no original
/// branch to analyze. At least this much must be implemented, else tail
/// merging needs to be disabled.
///
/// The CFG information in MBB.Predecessors and MBB.Successors must be valid
/// before calling this function.
virtual unsigned insertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
MachineBasicBlock *FBB,
ArrayRef<MachineOperand> Cond,
const DebugLoc &DL,
int *BytesAdded = nullptr) const {
llvm_unreachable("Target didn't implement TargetInstrInfo::insertBranch!");
}
unsigned insertUnconditionalBranch(MachineBasicBlock &MBB,
MachineBasicBlock *DestBB,
const DebugLoc &DL,
int *BytesAdded = nullptr) const {
return insertBranch(MBB, DestBB, nullptr,
ArrayRef<MachineOperand>(), DL, BytesAdded);
}
/// Analyze the loop code, return true if it cannot be understoo. Upon
/// success, this function returns false and returns information about the
/// induction variable and compare instruction used at the end.
virtual bool analyzeLoop(MachineLoop &L, MachineInstr *&IndVarInst,
MachineInstr *&CmpInst) const {
return true;
}
/// Generate code to reduce the loop iteration by one and check if the loop is
/// finished. Return the value/register of the the new loop count. We need
/// this function when peeling off one or more iterations of a loop. This
/// function assumes the nth iteration is peeled first.
virtual unsigned reduceLoopCount(MachineBasicBlock &MBB,
MachineInstr *IndVar, MachineInstr &Cmp,
SmallVectorImpl<MachineOperand> &Cond,
SmallVectorImpl<MachineInstr *> &PrevInsts,
unsigned Iter, unsigned MaxIter) const {
llvm_unreachable("Target didn't implement ReduceLoopCount");
}
/// Delete the instruction OldInst and everything after it, replacing it with
/// an unconditional branch to NewDest. This is used by the tail merging pass.
virtual void ReplaceTailWithBranchTo(MachineBasicBlock::iterator Tail,
MachineBasicBlock *NewDest) const;
/// Return true if it's legal to split the given basic
/// block at the specified instruction (i.e. instruction would be the start
/// of a new basic block).
virtual bool isLegalToSplitMBBAt(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI) const {
return true;
}
/// Return true if it's profitable to predicate
/// instructions with accumulated instruction latency of "NumCycles"
/// of the specified basic block, where the probability of the instructions
/// being executed is given by Probability, and Confidence is a measure
/// of our confidence that it will be properly predicted.
virtual
bool isProfitableToIfCvt(MachineBasicBlock &MBB, unsigned NumCycles,
unsigned ExtraPredCycles,
BranchProbability Probability) const {
return false;
}
/// Second variant of isProfitableToIfCvt. This one
/// checks for the case where two basic blocks from true and false path
/// of a if-then-else (diamond) are predicated on mutally exclusive
/// predicates, where the probability of the true path being taken is given
/// by Probability, and Confidence is a measure of our confidence that it
/// will be properly predicted.
virtual bool
isProfitableToIfCvt(MachineBasicBlock &TMBB,
unsigned NumTCycles, unsigned ExtraTCycles,
MachineBasicBlock &FMBB,
unsigned NumFCycles, unsigned ExtraFCycles,
BranchProbability Probability) const {
return false;
}
/// Return true if it's profitable for if-converter to duplicate instructions
/// of specified accumulated instruction latencies in the specified MBB to
/// enable if-conversion.
/// The probability of the instructions being executed is given by
/// Probability, and Confidence is a measure of our confidence that it
/// will be properly predicted.
virtual bool
isProfitableToDupForIfCvt(MachineBasicBlock &MBB, unsigned NumCycles,
BranchProbability Probability) const {
return false;
}
/// Return true if it's profitable to unpredicate
/// one side of a 'diamond', i.e. two sides of if-else predicated on mutually
/// exclusive predicates.
/// e.g.
/// subeq r0, r1, #1
/// addne r0, r1, #1
/// =>
/// sub r0, r1, #1
/// addne r0, r1, #1
///
/// This may be profitable is conditional instructions are always executed.
virtual bool isProfitableToUnpredicate(MachineBasicBlock &TMBB,
MachineBasicBlock &FMBB) const {
return false;
}
/// Return true if it is possible to insert a select
/// instruction that chooses between TrueReg and FalseReg based on the
/// condition code in Cond.
///
/// When successful, also return the latency in cycles from TrueReg,
/// FalseReg, and Cond to the destination register. In most cases, a select
/// instruction will be 1 cycle, so CondCycles = TrueCycles = FalseCycles = 1
///
/// Some x86 implementations have 2-cycle cmov instructions.
///
/// @param MBB Block where select instruction would be inserted.
/// @param Cond Condition returned by AnalyzeBranch.
/// @param TrueReg Virtual register to select when Cond is true.
/// @param FalseReg Virtual register to select when Cond is false.
/// @param CondCycles Latency from Cond+Branch to select output.
/// @param TrueCycles Latency from TrueReg to select output.
/// @param FalseCycles Latency from FalseReg to select output.
virtual bool canInsertSelect(const MachineBasicBlock &MBB,
ArrayRef<MachineOperand> Cond,
unsigned TrueReg, unsigned FalseReg,
int &CondCycles,
int &TrueCycles, int &FalseCycles) const {
return false;
}
/// Insert a select instruction into MBB before I that will copy TrueReg to
/// DstReg when Cond is true, and FalseReg to DstReg when Cond is false.
///
/// This function can only be called after canInsertSelect() returned true.
/// The condition in Cond comes from AnalyzeBranch, and it can be assumed
/// that the same flags or registers required by Cond are available at the
/// insertion point.
///
/// @param MBB Block where select instruction should be inserted.
/// @param I Insertion point.
/// @param DL Source location for debugging.
/// @param DstReg Virtual register to be defined by select instruction.
/// @param Cond Condition as computed by AnalyzeBranch.
/// @param TrueReg Virtual register to copy when Cond is true.
/// @param FalseReg Virtual register to copy when Cons is false.
virtual void insertSelect(MachineBasicBlock &MBB,
MachineBasicBlock::iterator I, const DebugLoc &DL,
unsigned DstReg, ArrayRef<MachineOperand> Cond,
unsigned TrueReg, unsigned FalseReg) const {
llvm_unreachable("Target didn't implement TargetInstrInfo::insertSelect!");
}
/// Analyze the given select instruction, returning true if
/// it cannot be understood. It is assumed that MI->isSelect() is true.
///
/// When successful, return the controlling condition and the operands that
/// determine the true and false result values.
///
/// Result = SELECT Cond, TrueOp, FalseOp
///
/// Some targets can optimize select instructions, for example by predicating
/// the instruction defining one of the operands. Such targets should set
/// Optimizable.
///
/// @param MI Select instruction to analyze.
/// @param Cond Condition controlling the select.
/// @param TrueOp Operand number of the value selected when Cond is true.
/// @param FalseOp Operand number of the value selected when Cond is false.
/// @param Optimizable Returned as true if MI is optimizable.
/// @returns False on success.
virtual bool analyzeSelect(const MachineInstr &MI,
SmallVectorImpl<MachineOperand> &Cond,
unsigned &TrueOp, unsigned &FalseOp,
bool &Optimizable) const {
assert(MI.getDesc().isSelect() && "MI must be a select instruction");
return true;
}
/// Given a select instruction that was understood by
/// analyzeSelect and returned Optimizable = true, attempt to optimize MI by
/// merging it with one of its operands. Returns NULL on failure.
///
/// When successful, returns the new select instruction. The client is
/// responsible for deleting MI.
///
/// If both sides of the select can be optimized, PreferFalse is used to pick
/// a side.
///
/// @param MI Optimizable select instruction.
/// @param NewMIs Set that record all MIs in the basic block up to \p
/// MI. Has to be updated with any newly created MI or deleted ones.
/// @param PreferFalse Try to optimize FalseOp instead of TrueOp.
/// @returns Optimized instruction or NULL.
virtual MachineInstr *optimizeSelect(MachineInstr &MI,
SmallPtrSetImpl<MachineInstr *> &NewMIs,
bool PreferFalse = false) const {
// This function must be implemented if Optimizable is ever set.
llvm_unreachable("Target must implement TargetInstrInfo::optimizeSelect!");
}
/// Emit instructions to copy a pair of physical registers.
///
/// This function should support copies within any legal register class as
/// well as any cross-class copies created during instruction selection.
///
/// The source and destination registers may overlap, which may require a
/// careful implementation when multiple copy instructions are required for
/// large registers. See for example the ARM target.
virtual void copyPhysReg(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI, const DebugLoc &DL,
unsigned DestReg, unsigned SrcReg,
bool KillSrc) const {
llvm_unreachable("Target didn't implement TargetInstrInfo::copyPhysReg!");
}
/// Store the specified register of the given register class to the specified
/// stack frame index. The store instruction is to be added to the given
/// machine basic block before the specified machine instruction. If isKill
/// is true, the register operand is the last use and must be marked kill.
virtual void storeRegToStackSlot(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI,
unsigned SrcReg, bool isKill, int FrameIndex,
const TargetRegisterClass *RC,
const TargetRegisterInfo *TRI) const {
llvm_unreachable("Target didn't implement "
"TargetInstrInfo::storeRegToStackSlot!");
}
/// Load the specified register of the given register class from the specified
/// stack frame index. The load instruction is to be added to the given
/// machine basic block before the specified machine instruction.
virtual void loadRegFromStackSlot(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI,
unsigned DestReg, int FrameIndex,
const TargetRegisterClass *RC,
const TargetRegisterInfo *TRI) const {
llvm_unreachable("Target didn't implement "
"TargetInstrInfo::loadRegFromStackSlot!");
}
/// This function is called for all pseudo instructions
/// that remain after register allocation. Many pseudo instructions are
/// created to help register allocation. This is the place to convert them
/// into real instructions. The target can edit MI in place, or it can insert
/// new instructions and erase MI. The function should return true if
/// anything was changed.
virtual bool expandPostRAPseudo(MachineInstr &MI) const { return false; }
/// Check whether the target can fold a load that feeds a subreg operand
/// (or a subreg operand that feeds a store).
/// For example, X86 may want to return true if it can fold
/// movl (%esp), %eax
/// subb, %al, ...
/// Into:
/// subb (%esp), ...
///
/// Ideally, we'd like the target implementation of foldMemoryOperand() to
/// reject subregs - but since this behavior used to be enforced in the
/// target-independent code, moving this responsibility to the targets
/// has the potential of causing nasty silent breakage in out-of-tree targets.
virtual bool isSubregFoldable() const { return false; }
/// Attempt to fold a load or store of the specified stack
/// slot into the specified machine instruction for the specified operand(s).
/// If this is possible, a new instruction is returned with the specified
/// operand folded, otherwise NULL is returned.
/// The new instruction is inserted before MI, and the client is responsible
/// for removing the old instruction.
MachineInstr *foldMemoryOperand(MachineInstr &MI, ArrayRef<unsigned> Ops,
int FrameIndex,
LiveIntervals *LIS = nullptr) const;
/// Same as the previous version except it allows folding of any load and
/// store from / to any address, not just from a specific stack slot.
MachineInstr *foldMemoryOperand(MachineInstr &MI, ArrayRef<unsigned> Ops,
MachineInstr &LoadMI,
LiveIntervals *LIS = nullptr) const;
/// Return true when there is potentially a faster code sequence
/// for an instruction chain ending in \p Root. All potential patterns are
/// returned in the \p Pattern vector. Pattern should be sorted in priority
/// order since the pattern evaluator stops checking as soon as it finds a
/// faster sequence.
/// \param Root - Instruction that could be combined with one of its operands
/// \param Patterns - Vector of possible combination patterns
virtual bool getMachineCombinerPatterns(
MachineInstr &Root,
SmallVectorImpl<MachineCombinerPattern> &Patterns) const;
/// Return true when a code sequence can improve throughput. It
/// should be called only for instructions in loops.
/// \param Pattern - combiner pattern
virtual bool isThroughputPattern(MachineCombinerPattern Pattern) const;
/// Return true if the input \P Inst is part of a chain of dependent ops
/// that are suitable for reassociation, otherwise return false.
/// If the instruction's operands must be commuted to have a previous
/// instruction of the same type define the first source operand, \P Commuted
/// will be set to true.
bool isReassociationCandidate(const MachineInstr &Inst, bool &Commuted) const;
/// Return true when \P Inst is both associative and commutative.
virtual bool isAssociativeAndCommutative(const MachineInstr &Inst) const {
return false;
}
/// Return true when \P Inst has reassociable operands in the same \P MBB.
virtual bool hasReassociableOperands(const MachineInstr &Inst,
const MachineBasicBlock *MBB) const;
/// Return true when \P Inst has reassociable sibling.
bool hasReassociableSibling(const MachineInstr &Inst, bool &Commuted) const;
/// When getMachineCombinerPatterns() finds patterns, this function generates
/// the instructions that could replace the original code sequence. The client
/// has to decide whether the actual replacement is beneficial or not.
/// \param Root - Instruction that could be combined with one of its operands
/// \param Pattern - Combination pattern for Root
/// \param InsInstrs - Vector of new instructions that implement P
/// \param DelInstrs - Old instructions, including Root, that could be
/// replaced by InsInstr
/// \param InstrIdxForVirtReg - map of virtual register to instruction in
/// InsInstr that defines it
virtual void genAlternativeCodeSequence(
MachineInstr &Root, MachineCombinerPattern Pattern,
SmallVectorImpl<MachineInstr *> &InsInstrs,
SmallVectorImpl<MachineInstr *> &DelInstrs,
DenseMap<unsigned, unsigned> &InstrIdxForVirtReg) const;
/// Attempt to reassociate \P Root and \P Prev according to \P Pattern to
/// reduce critical path length.
void reassociateOps(MachineInstr &Root, MachineInstr &Prev,
MachineCombinerPattern Pattern,
SmallVectorImpl<MachineInstr *> &InsInstrs,
SmallVectorImpl<MachineInstr *> &DelInstrs,
DenseMap<unsigned, unsigned> &InstrIdxForVirtReg) const;
/// This is an architecture-specific helper function of reassociateOps.
/// Set special operand attributes for new instructions after reassociation.
virtual void setSpecialOperandAttr(MachineInstr &OldMI1, MachineInstr &OldMI2,
MachineInstr &NewMI1,
MachineInstr &NewMI2) const {
}
/// Return true when a target supports MachineCombiner.
virtual bool useMachineCombiner() const { return false; }
protected:
/// Target-dependent implementation for foldMemoryOperand.
/// Target-independent code in foldMemoryOperand will
/// take care of adding a MachineMemOperand to the newly created instruction.
/// The instruction and any auxiliary instructions necessary will be inserted
/// at InsertPt.
virtual MachineInstr *
foldMemoryOperandImpl(MachineFunction &MF, MachineInstr &MI,
ArrayRef<unsigned> Ops,
MachineBasicBlock::iterator InsertPt, int FrameIndex,
LiveIntervals *LIS = nullptr) const {
return nullptr;
}
/// Target-dependent implementation for foldMemoryOperand.
/// Target-independent code in foldMemoryOperand will
/// take care of adding a MachineMemOperand to the newly created instruction.
/// The instruction and any auxiliary instructions necessary will be inserted
/// at InsertPt.
virtual MachineInstr *foldMemoryOperandImpl(
MachineFunction &MF, MachineInstr &MI, ArrayRef<unsigned> Ops,
MachineBasicBlock::iterator InsertPt, MachineInstr &LoadMI,
LiveIntervals *LIS = nullptr) const {
return nullptr;
}
/// \brief Target-dependent implementation of getRegSequenceInputs.
///
/// \returns true if it is possible to build the equivalent
/// REG_SEQUENCE inputs with the pair \p MI, \p DefIdx. False otherwise.
///
/// \pre MI.isRegSequenceLike().
///
/// \see TargetInstrInfo::getRegSequenceInputs.
virtual bool getRegSequenceLikeInputs(
const MachineInstr &MI, unsigned DefIdx,
SmallVectorImpl<RegSubRegPairAndIdx> &InputRegs) const {
return false;
}
/// \brief Target-dependent implementation of getExtractSubregInputs.
///
/// \returns true if it is possible to build the equivalent
/// EXTRACT_SUBREG inputs with the pair \p MI, \p DefIdx. False otherwise.
///
/// \pre MI.isExtractSubregLike().
///
/// \see TargetInstrInfo::getExtractSubregInputs.
virtual bool getExtractSubregLikeInputs(
const MachineInstr &MI, unsigned DefIdx,
RegSubRegPairAndIdx &InputReg) const {
return false;
}
/// \brief Target-dependent implementation of getInsertSubregInputs.
///
/// \returns true if it is possible to build the equivalent
/// INSERT_SUBREG inputs with the pair \p MI, \p DefIdx. False otherwise.
///
/// \pre MI.isInsertSubregLike().
///
/// \see TargetInstrInfo::getInsertSubregInputs.
virtual bool
getInsertSubregLikeInputs(const MachineInstr &MI, unsigned DefIdx,
RegSubRegPair &BaseReg,
RegSubRegPairAndIdx &InsertedReg) const {
return false;
}
public:
/// unfoldMemoryOperand - Separate a single instruction which folded a load or
/// a store or a load and a store into two or more instruction. If this is
/// possible, returns true as well as the new instructions by reference.
virtual bool
unfoldMemoryOperand(MachineFunction &MF, MachineInstr &MI, unsigned Reg,
bool UnfoldLoad, bool UnfoldStore,
SmallVectorImpl<MachineInstr *> &NewMIs) const {
return false;
}
virtual bool unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N,
SmallVectorImpl<SDNode*> &NewNodes) const {
return false;
}
/// Returns the opcode of the would be new
/// instruction after load / store are unfolded from an instruction of the
/// specified opcode. It returns zero if the specified unfolding is not
/// possible. If LoadRegIndex is non-null, it is filled in with the operand
/// index of the operand which will hold the register holding the loaded
/// value.
virtual unsigned getOpcodeAfterMemoryUnfold(unsigned Opc,
bool UnfoldLoad, bool UnfoldStore,
unsigned *LoadRegIndex = nullptr) const {
return 0;
}
/// This is used by the pre-regalloc scheduler to determine if two loads are
/// loading from the same base address. It should only return true if the base
/// pointers are the same and the only differences between the two addresses
/// are the offset. It also returns the offsets by reference.
virtual bool areLoadsFromSameBasePtr(SDNode *Load1, SDNode *Load2,
int64_t &Offset1, int64_t &Offset2) const {
return false;
}
/// This is a used by the pre-regalloc scheduler to determine (in conjunction
/// with areLoadsFromSameBasePtr) if two loads should be scheduled together.
/// On some targets if two loads are loading from
/// addresses in the same cache line, it's better if they are scheduled
/// together. This function takes two integers that represent the load offsets
/// from the common base address. It returns true if it decides it's desirable
/// to schedule the two loads together. "NumLoads" is the number of loads that
/// have already been scheduled after Load1.
virtual bool shouldScheduleLoadsNear(SDNode *Load1, SDNode *Load2,
int64_t Offset1, int64_t Offset2,
unsigned NumLoads) const {
return false;
}
/// Get the base register and byte offset of an instruction that reads/writes
/// memory.
virtual bool getMemOpBaseRegImmOfs(MachineInstr &MemOp, unsigned &BaseReg,
int64_t &Offset,
const TargetRegisterInfo *TRI) const {
return false;
}
/// Return true if the instruction contains a base register and offset. If
/// true, the function also sets the operand position in the instruction
/// for the base register and offset.
virtual bool getBaseAndOffsetPosition(const MachineInstr &MI,
unsigned &BasePos,
unsigned &OffsetPos) const {
return false;
}
/// If the instruction is an increment of a constant value, return the amount.
virtual bool getIncrementValue(const MachineInstr &MI, int &Value) const {
return false;
}
/// Returns true if the two given memory operations should be scheduled
/// adjacent. Note that you have to add:
/// DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
/// or
/// DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
/// to TargetPassConfig::createMachineScheduler() to have an effect.
virtual bool shouldClusterMemOps(MachineInstr &FirstLdSt,
MachineInstr &SecondLdSt,
unsigned NumLoads) const {
llvm_unreachable("target did not implement shouldClusterMemOps()");
}
/// Can this target fuse the given instructions if they are scheduled
/// adjacent. Note that you have to add:
/// DAG.addMutation(createMacroFusionDAGMutation());
/// to TargetPassConfig::createMachineScheduler() to have an effect.
virtual bool shouldScheduleAdjacent(const MachineInstr &First,
const MachineInstr &Second) const {
llvm_unreachable("target did not implement shouldScheduleAdjacent()");
}
/// Reverses the branch condition of the specified condition list,
/// returning false on success and true if it cannot be reversed.
virtual
bool reverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const {
return true;
}
/// Insert a noop into the instruction stream at the specified point.
virtual void insertNoop(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI) const;
/// Return the noop instruction to use for a noop.
virtual void getNoopForMachoTarget(MCInst &NopInst) const;
/// Return true for post-incremented instructions.
virtual bool isPostIncrement(const MachineInstr &MI) const {
return false;
}
/// Returns true if the instruction is already predicated.
virtual bool isPredicated(const MachineInstr &MI) const {
return false;
}
/// Returns true if the instruction is a
/// terminator instruction that has not been predicated.
virtual bool isUnpredicatedTerminator(const MachineInstr &MI) const;
- /// Returns true if MI is an unconditional tail call.
- virtual bool isUnconditionalTailCall(const MachineInstr &MI) const {
- return false;
- }
-
- /// Returns true if the tail call can be made conditional on BranchCond.
- virtual bool
- canMakeTailCallConditional(SmallVectorImpl<MachineOperand> &Cond,
- const MachineInstr &TailCall) const {
- return false;
- }
-
- /// Replace the conditional branch in MBB with a conditional tail call.
- virtual void replaceBranchWithTailCall(MachineBasicBlock &MBB,
- SmallVectorImpl<MachineOperand> &Cond,
- const MachineInstr &TailCall) const {
- llvm_unreachable("Target didn't implement replaceBranchWithTailCall!");
- }
-
/// Convert the instruction into a predicated instruction.
/// It returns true if the operation was successful.
virtual bool PredicateInstruction(MachineInstr &MI,
ArrayRef<MachineOperand> Pred) const;
/// Returns true if the first specified predicate
/// subsumes the second, e.g. GE subsumes GT.
virtual
bool SubsumesPredicate(ArrayRef<MachineOperand> Pred1,
ArrayRef<MachineOperand> Pred2) const {
return false;
}
/// If the specified instruction defines any predicate
/// or condition code register(s) used for predication, returns true as well
/// as the definition predicate(s) by reference.
virtual bool DefinesPredicate(MachineInstr &MI,
std::vector<MachineOperand> &Pred) const {
return false;
}
/// Return true if the specified instruction can be predicated.
/// By default, this returns true for every instruction with a
/// PredicateOperand.
virtual bool isPredicable(MachineInstr &MI) const {
return MI.getDesc().isPredicable();
}
/// Return true if it's safe to move a machine
/// instruction that defines the specified register class.
virtual bool isSafeToMoveRegClassDefs(const TargetRegisterClass *RC) const {
return true;
}
/// Test if the given instruction should be considered a scheduling boundary.
/// This primarily includes labels and terminators.
virtual bool isSchedulingBoundary(const MachineInstr &MI,
const MachineBasicBlock *MBB,
const MachineFunction &MF) const;
/// Measure the specified inline asm to determine an approximation of its
/// length.
virtual unsigned getInlineAsmLength(const char *Str,
const MCAsmInfo &MAI) const;
/// Allocate and return a hazard recognizer to use for this target when
/// scheduling the machine instructions before register allocation.
virtual ScheduleHazardRecognizer*
CreateTargetHazardRecognizer(const TargetSubtargetInfo *STI,
const ScheduleDAG *DAG) const;
/// Allocate and return a hazard recognizer to use for this target when
/// scheduling the machine instructions before register allocation.
virtual ScheduleHazardRecognizer*
CreateTargetMIHazardRecognizer(const InstrItineraryData*,
const ScheduleDAG *DAG) const;
/// Allocate and return a hazard recognizer to use for this target when
/// scheduling the machine instructions after register allocation.
virtual ScheduleHazardRecognizer*
CreateTargetPostRAHazardRecognizer(const InstrItineraryData*,
const ScheduleDAG *DAG) const;
/// Allocate and return a hazard recognizer to use for by non-scheduling
/// passes.
virtual ScheduleHazardRecognizer*
CreateTargetPostRAHazardRecognizer(const MachineFunction &MF) const {
return nullptr;
}
/// Provide a global flag for disabling the PreRA hazard recognizer that
/// targets may choose to honor.
bool usePreRAHazardRecognizer() const;
/// For a comparison instruction, return the source registers
/// in SrcReg and SrcReg2 if having two register operands, and the value it
/// compares against in CmpValue. Return true if the comparison instruction
/// can be analyzed.
virtual bool analyzeCompare(const MachineInstr &MI, unsigned &SrcReg,
unsigned &SrcReg2, int &Mask, int &Value) const {
return false;
}
/// See if the comparison instruction can be converted
/// into something more efficient. E.g., on ARM most instructions can set the
/// flags register, obviating the need for a separate CMP.
virtual bool optimizeCompareInstr(MachineInstr &CmpInstr, unsigned SrcReg,
unsigned SrcReg2, int Mask, int Value,
const MachineRegisterInfo *MRI) const {
return false;
}
virtual bool optimizeCondBranch(MachineInstr &MI) const { return false; }
/// Try to remove the load by folding it to a register operand at the use.
/// We fold the load instructions if and only if the
/// def and use are in the same BB. We only look at one load and see
/// whether it can be folded into MI. FoldAsLoadDefReg is the virtual register
/// defined by the load we are trying to fold. DefMI returns the machine
/// instruction that defines FoldAsLoadDefReg, and the function returns
/// the machine instruction generated due to folding.
virtual MachineInstr *optimizeLoadInstr(MachineInstr &MI,
const MachineRegisterInfo *MRI,
unsigned &FoldAsLoadDefReg,
MachineInstr *&DefMI) const {
return nullptr;
}
/// 'Reg' is known to be defined by a move immediate instruction,
/// try to fold the immediate into the use instruction.
/// If MRI->hasOneNonDBGUse(Reg) is true, and this function returns true,
/// then the caller may assume that DefMI has been erased from its parent
/// block. The caller may assume that it will not be erased by this
/// function otherwise.
virtual bool FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
unsigned Reg, MachineRegisterInfo *MRI) const {
return false;
}
/// Return the number of u-operations the given machine
/// instruction will be decoded to on the target cpu. The itinerary's
/// IssueWidth is the number of microops that can be dispatched each
/// cycle. An instruction with zero microops takes no dispatch resources.
virtual unsigned getNumMicroOps(const InstrItineraryData *ItinData,
const MachineInstr &MI) const;
/// Return true for pseudo instructions that don't consume any
/// machine resources in their current form. These are common cases that the
/// scheduler should consider free, rather than conservatively handling them
/// as instructions with no itinerary.
bool isZeroCost(unsigned Opcode) const {
return Opcode <= TargetOpcode::COPY;
}
virtual int getOperandLatency(const InstrItineraryData *ItinData,
SDNode *DefNode, unsigned DefIdx,
SDNode *UseNode, unsigned UseIdx) const;
/// Compute and return the use operand latency of a given pair of def and use.
/// In most cases, the static scheduling itinerary was enough to determine the
/// operand latency. But it may not be possible for instructions with variable
/// number of defs / uses.
///
/// This is a raw interface to the itinerary that may be directly overridden
/// by a target. Use computeOperandLatency to get the best estimate of
/// latency.
virtual int getOperandLatency(const InstrItineraryData *ItinData,
const MachineInstr &DefMI, unsigned DefIdx,
const MachineInstr &UseMI,
unsigned UseIdx) const;
/// Compute the instruction latency of a given instruction.
/// If the instruction has higher cost when predicated, it's returned via
/// PredCost.
virtual unsigned getInstrLatency(const InstrItineraryData *ItinData,
const MachineInstr &MI,
unsigned *PredCost = nullptr) const;
virtual unsigned getPredicationCost(const MachineInstr &MI) const;
virtual int getInstrLatency(const InstrItineraryData *ItinData,
SDNode *Node) const;
/// Return the default expected latency for a def based on its opcode.
unsigned defaultDefLatency(const MCSchedModel &SchedModel,
const MachineInstr &DefMI) const;
int computeDefOperandLatency(const InstrItineraryData *ItinData,
const MachineInstr &DefMI) const;
/// Return true if this opcode has high latency to its result.
virtual bool isHighLatencyDef(int opc) const { return false; }
/// Compute operand latency between a def of 'Reg'
/// and a use in the current loop. Return true if the target considered
/// it 'high'. This is used by optimization passes such as machine LICM to
/// determine whether it makes sense to hoist an instruction out even in a
/// high register pressure situation.
virtual bool hasHighOperandLatency(const TargetSchedModel &SchedModel,
const MachineRegisterInfo *MRI,
const MachineInstr &DefMI, unsigned DefIdx,
const MachineInstr &UseMI,
unsigned UseIdx) const {
return false;
}
/// Compute operand latency of a def of 'Reg'. Return true
/// if the target considered it 'low'.
virtual bool hasLowDefLatency(const TargetSchedModel &SchedModel,
const MachineInstr &DefMI,
unsigned DefIdx) const;
/// Perform target-specific instruction verification.
virtual bool verifyInstruction(const MachineInstr &MI,
StringRef &ErrInfo) const {
return true;
}
/// Return the current execution domain and bit mask of
/// possible domains for instruction.
///
/// Some micro-architectures have multiple execution domains, and multiple
/// opcodes that perform the same operation in different domains. For
/// example, the x86 architecture provides the por, orps, and orpd
/// instructions that all do the same thing. There is a latency penalty if a
/// register is written in one domain and read in another.
///
/// This function returns a pair (domain, mask) containing the execution
/// domain of MI, and a bit mask of possible domains. The setExecutionDomain
/// function can be used to change the opcode to one of the domains in the
/// bit mask. Instructions whose execution domain can't be changed should
/// return a 0 mask.
///
/// The execution domain numbers don't have any special meaning except domain
/// 0 is used for instructions that are not associated with any interesting
/// execution domain.
///
virtual std::pair<uint16_t, uint16_t>
getExecutionDomain(const MachineInstr &MI) const {
return std::make_pair(0, 0);
}
/// Change the opcode of MI to execute in Domain.
///
/// The bit (1 << Domain) must be set in the mask returned from
/// getExecutionDomain(MI).
virtual void setExecutionDomain(MachineInstr &MI, unsigned Domain) const {}
/// Returns the preferred minimum clearance
/// before an instruction with an unwanted partial register update.
///
/// Some instructions only write part of a register, and implicitly need to
/// read the other parts of the register. This may cause unwanted stalls
/// preventing otherwise unrelated instructions from executing in parallel in
/// an out-of-order CPU.
///
/// For example, the x86 instruction cvtsi2ss writes its result to bits
/// [31:0] of the destination xmm register. Bits [127:32] are unaffected, so
/// the instruction needs to wait for the old value of the register to become
/// available:
///
/// addps %xmm1, %xmm0
/// movaps %xmm0, (%rax)
/// cvtsi2ss %rbx, %xmm0
///
/// In the code above, the cvtsi2ss instruction needs to wait for the addps
/// instruction before it can issue, even though the high bits of %xmm0
/// probably aren't needed.
///
/// This hook returns the preferred clearance before MI, measured in
/// instructions. Other defs of MI's operand OpNum are avoided in the last N
/// instructions before MI. It should only return a positive value for
/// unwanted dependencies. If the old bits of the defined register have
/// useful values, or if MI is determined to otherwise read the dependency,
/// the hook should return 0.
///
/// The unwanted dependency may be handled by:
///
/// 1. Allocating the same register for an MI def and use. That makes the
/// unwanted dependency identical to a required dependency.
///
/// 2. Allocating a register for the def that has no defs in the previous N
/// instructions.
///
/// 3. Calling breakPartialRegDependency() with the same arguments. This
/// allows the target to insert a dependency breaking instruction.
///
virtual unsigned
getPartialRegUpdateClearance(const MachineInstr &MI, unsigned OpNum,
const TargetRegisterInfo *TRI) const {
// The default implementation returns 0 for no partial register dependency.
return 0;
}
/// \brief Return the minimum clearance before an instruction that reads an
/// unused register.
///
/// For example, AVX instructions may copy part of a register operand into
/// the unused high bits of the destination register.
///
/// vcvtsi2sdq %rax, %xmm0<undef>, %xmm14
///
/// In the code above, vcvtsi2sdq copies %xmm0[127:64] into %xmm14 creating a
/// false dependence on any previous write to %xmm0.
///
/// This hook works similarly to getPartialRegUpdateClearance, except that it
/// does not take an operand index. Instead sets \p OpNum to the index of the
/// unused register.
virtual unsigned getUndefRegClearance(const MachineInstr &MI, unsigned &OpNum,
const TargetRegisterInfo *TRI) const {
// The default implementation returns 0 for no undef register dependency.
return 0;
}
/// Insert a dependency-breaking instruction
/// before MI to eliminate an unwanted dependency on OpNum.
///
/// If it wasn't possible to avoid a def in the last N instructions before MI
/// (see getPartialRegUpdateClearance), this hook will be called to break the
/// unwanted dependency.
///
/// On x86, an xorps instruction can be used as a dependency breaker:
///
/// addps %xmm1, %xmm0
/// movaps %xmm0, (%rax)
/// xorps %xmm0, %xmm0
/// cvtsi2ss %rbx, %xmm0
///
/// An <imp-kill> operand should be added to MI if an instruction was
/// inserted. This ties the instructions together in the post-ra scheduler.
///
virtual void breakPartialRegDependency(MachineInstr &MI, unsigned OpNum,
const TargetRegisterInfo *TRI) const {}
/// Create machine specific model for scheduling.
virtual DFAPacketizer *
CreateTargetScheduleState(const TargetSubtargetInfo &) const {
return nullptr;
}
// Sometimes, it is possible for the target
// to tell, even without aliasing information, that two MIs access different
// memory addresses. This function returns true if two MIs access different
// memory addresses and false otherwise.
virtual bool
areMemAccessesTriviallyDisjoint(MachineInstr &MIa, MachineInstr &MIb,
AliasAnalysis *AA = nullptr) const {
assert((MIa.mayLoad() || MIa.mayStore()) &&
"MIa must load from or modify a memory location");
assert((MIb.mayLoad() || MIb.mayStore()) &&
"MIb must load from or modify a memory location");
return false;
}
/// \brief Return the value to use for the MachineCSE's LookAheadLimit,
/// which is a heuristic used for CSE'ing phys reg defs.
virtual unsigned getMachineCSELookAheadLimit () const {
// The default lookahead is small to prevent unprofitable quadratic
// behavior.
return 5;
}
/// Return an array that contains the ids of the target indices (used for the
/// TargetIndex machine operand) and their names.
///
/// MIR Serialization is able to serialize only the target indices that are
/// defined by this method.
virtual ArrayRef<std::pair<int, const char *>>
getSerializableTargetIndices() const {
return None;
}
/// Decompose the machine operand's target flags into two values - the direct
/// target flag value and any of bit flags that are applied.
virtual std::pair<unsigned, unsigned>
decomposeMachineOperandsTargetFlags(unsigned /*TF*/) const {
return std::make_pair(0u, 0u);
}
/// Return an array that contains the direct target flag values and their
/// names.
///
/// MIR Serialization is able to serialize only the target flags that are
/// defined by this method.
virtual ArrayRef<std::pair<unsigned, const char *>>
getSerializableDirectMachineOperandTargetFlags() const {
return None;
}
/// Return an array that contains the bitmask target flag values and their
/// names.
///
/// MIR Serialization is able to serialize only the target flags that are
/// defined by this method.
virtual ArrayRef<std::pair<unsigned, const char *>>
getSerializableBitmaskMachineOperandTargetFlags() const {
return None;
}
/// Determines whether |Inst| is a tail call instruction.
virtual bool isTailCall(const MachineInstr &Inst) const {
return false;
}
private:
unsigned CallFrameSetupOpcode, CallFrameDestroyOpcode;
unsigned CatchRetOpcode;
unsigned ReturnOpcode;
};
/// \brief Provide DenseMapInfo for TargetInstrInfo::RegSubRegPair.
template<>
struct DenseMapInfo<TargetInstrInfo::RegSubRegPair> {
typedef DenseMapInfo<unsigned> RegInfo;
static inline TargetInstrInfo::RegSubRegPair getEmptyKey() {
return TargetInstrInfo::RegSubRegPair(RegInfo::getEmptyKey(),
RegInfo::getEmptyKey());
}
static inline TargetInstrInfo::RegSubRegPair getTombstoneKey() {
return TargetInstrInfo::RegSubRegPair(RegInfo::getTombstoneKey(),
RegInfo::getTombstoneKey());
}
/// \brief Reuse getHashValue implementation from
/// std::pair<unsigned, unsigned>.
static unsigned getHashValue(const TargetInstrInfo::RegSubRegPair &Val) {
std::pair<unsigned, unsigned> PairVal =
std::make_pair(Val.Reg, Val.SubReg);
return DenseMapInfo<std::pair<unsigned, unsigned>>::getHashValue(PairVal);
}
static bool isEqual(const TargetInstrInfo::RegSubRegPair &LHS,
const TargetInstrInfo::RegSubRegPair &RHS) {
return RegInfo::isEqual(LHS.Reg, RHS.Reg) &&
RegInfo::isEqual(LHS.SubReg, RHS.SubReg);
}
};
} // end namespace llvm
#endif // LLVM_TARGET_TARGETINSTRINFO_H
Index: projects/clang400-import/contrib/llvm/lib/Bitcode/Reader/MetadataLoader.cpp
===================================================================
--- projects/clang400-import/contrib/llvm/lib/Bitcode/Reader/MetadataLoader.cpp (revision 313642)
+++ projects/clang400-import/contrib/llvm/lib/Bitcode/Reader/MetadataLoader.cpp (revision 313643)
@@ -1,1804 +1,1850 @@
//===- MetadataLoader.cpp - Internal BitcodeReader implementation ---------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
#include "MetadataLoader.h"
#include "ValueList.h"
#include "llvm/ADT/APFloat.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/None.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Triple.h"
#include "llvm/ADT/Twine.h"
#include "llvm/Bitcode/BitcodeReader.h"
#include "llvm/Bitcode/BitstreamReader.h"
#include "llvm/Bitcode/LLVMBitCodes.h"
#include "llvm/IR/Argument.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/AutoUpgrade.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CallSite.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/Comdat.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DebugInfo.h"
#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/DiagnosticPrinter.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GVMaterializer.h"
#include "llvm/IR/GlobalAlias.h"
#include "llvm/IR/GlobalIFunc.h"
#include "llvm/IR/GlobalIndirectSymbol.h"
#include "llvm/IR/GlobalObject.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/InlineAsm.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/ModuleSummaryIndex.h"
#include "llvm/IR/OperandTraits.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/TrackingMDRef.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/ValueHandle.h"
#include "llvm/Support/AtomicOrdering.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/Error.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/ManagedStatic.h"
#include "llvm/Support/MemoryBuffer.h"
#include "llvm/Support/raw_ostream.h"
#include <algorithm>
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <deque>
#include <limits>
#include <map>
#include <memory>
#include <string>
#include <system_error>
#include <tuple>
#include <utility>
#include <vector>
using namespace llvm;
#define DEBUG_TYPE "bitcode-reader"
STATISTIC(NumMDStringLoaded, "Number of MDStrings loaded");
STATISTIC(NumMDNodeTemporary, "Number of MDNode::Temporary created");
STATISTIC(NumMDRecordLoaded, "Number of Metadata records loaded");
/// Flag whether we need to import full type definitions for ThinLTO.
/// Currently needed for Darwin and LLDB.
static cl::opt<bool> ImportFullTypeDefinitions(
"import-full-type-definitions", cl::init(false), cl::Hidden,
cl::desc("Import full type definitions for ThinLTO."));
static cl::opt<bool> DisableLazyLoading(
"disable-ondemand-mds-loading", cl::init(false), cl::Hidden,
cl::desc("Force disable the lazy-loading on-demand of metadata when "
"loading bitcode for importing."));
namespace {
static int64_t unrotateSign(uint64_t U) { return U & 1 ? ~(U >> 1) : U >> 1; }
class BitcodeReaderMetadataList {
/// Array of metadata references.
///
/// Don't use std::vector here. Some versions of libc++ copy (instead of
/// move) on resize, and TrackingMDRef is very expensive to copy.
SmallVector<TrackingMDRef, 1> MetadataPtrs;
/// The set of indices in MetadataPtrs above of forward references that were
/// generated.
SmallDenseSet<unsigned, 1> ForwardReference;
/// The set of indices in MetadataPtrs above of Metadata that need to be
/// resolved.
SmallDenseSet<unsigned, 1> UnresolvedNodes;
/// Structures for resolving old type refs.
struct {
SmallDenseMap<MDString *, TempMDTuple, 1> Unknown;
SmallDenseMap<MDString *, DICompositeType *, 1> Final;
SmallDenseMap<MDString *, DICompositeType *, 1> FwdDecls;
SmallVector<std::pair<TrackingMDRef, TempMDTuple>, 1> Arrays;
} OldTypeRefs;
LLVMContext &Context;
public:
BitcodeReaderMetadataList(LLVMContext &C) : Context(C) {}
// vector compatibility methods
unsigned size() const { return MetadataPtrs.size(); }
void resize(unsigned N) { MetadataPtrs.resize(N); }
void push_back(Metadata *MD) { MetadataPtrs.emplace_back(MD); }
void clear() { MetadataPtrs.clear(); }
Metadata *back() const { return MetadataPtrs.back(); }
void pop_back() { MetadataPtrs.pop_back(); }
bool empty() const { return MetadataPtrs.empty(); }
Metadata *operator[](unsigned i) const {
assert(i < MetadataPtrs.size());
return MetadataPtrs[i];
}
Metadata *lookup(unsigned I) const {
if (I < MetadataPtrs.size())
return MetadataPtrs[I];
return nullptr;
}
void shrinkTo(unsigned N) {
assert(N <= size() && "Invalid shrinkTo request!");
assert(ForwardReference.empty() && "Unexpected forward refs");
assert(UnresolvedNodes.empty() && "Unexpected unresolved node");
MetadataPtrs.resize(N);
}
/// Return the given metadata, creating a replaceable forward reference if
/// necessary.
Metadata *getMetadataFwdRef(unsigned Idx);
/// Return the the given metadata only if it is fully resolved.
///
/// Gives the same result as \a lookup(), unless \a MDNode::isResolved()
/// would give \c false.
Metadata *getMetadataIfResolved(unsigned Idx);
MDNode *getMDNodeFwdRefOrNull(unsigned Idx);
void assignValue(Metadata *MD, unsigned Idx);
void tryToResolveCycles();
bool hasFwdRefs() const { return !ForwardReference.empty(); }
int getNextFwdRef() {
assert(hasFwdRefs());
return *ForwardReference.begin();
}
/// Upgrade a type that had an MDString reference.
void addTypeRef(MDString &UUID, DICompositeType &CT);
/// Upgrade a type that had an MDString reference.
Metadata *upgradeTypeRef(Metadata *MaybeUUID);
/// Upgrade a type ref array that may have MDString references.
Metadata *upgradeTypeRefArray(Metadata *MaybeTuple);
private:
Metadata *resolveTypeRefArray(Metadata *MaybeTuple);
};
void BitcodeReaderMetadataList::assignValue(Metadata *MD, unsigned Idx) {
if (auto *MDN = dyn_cast<MDNode>(MD))
if (!MDN->isResolved())
UnresolvedNodes.insert(Idx);
if (Idx == size()) {
push_back(MD);
return;
}
if (Idx >= size())
resize(Idx + 1);
TrackingMDRef &OldMD = MetadataPtrs[Idx];
if (!OldMD) {
OldMD.reset(MD);
return;
}
// If there was a forward reference to this value, replace it.
TempMDTuple PrevMD(cast<MDTuple>(OldMD.get()));
PrevMD->replaceAllUsesWith(MD);
ForwardReference.erase(Idx);
}
Metadata *BitcodeReaderMetadataList::getMetadataFwdRef(unsigned Idx) {
if (Idx >= size())
resize(Idx + 1);
if (Metadata *MD = MetadataPtrs[Idx])
return MD;
// Track forward refs to be resolved later.
ForwardReference.insert(Idx);
// Create and return a placeholder, which will later be RAUW'd.
++NumMDNodeTemporary;
Metadata *MD = MDNode::getTemporary(Context, None).release();
MetadataPtrs[Idx].reset(MD);
return MD;
}
Metadata *BitcodeReaderMetadataList::getMetadataIfResolved(unsigned Idx) {
Metadata *MD = lookup(Idx);
if (auto *N = dyn_cast_or_null<MDNode>(MD))
if (!N->isResolved())
return nullptr;
return MD;
}
MDNode *BitcodeReaderMetadataList::getMDNodeFwdRefOrNull(unsigned Idx) {
return dyn_cast_or_null<MDNode>(getMetadataFwdRef(Idx));
}
void BitcodeReaderMetadataList::tryToResolveCycles() {
if (!ForwardReference.empty())
// Still forward references... can't resolve cycles.
return;
// Give up on finding a full definition for any forward decls that remain.
for (const auto &Ref : OldTypeRefs.FwdDecls)
OldTypeRefs.Final.insert(Ref);
OldTypeRefs.FwdDecls.clear();
// Upgrade from old type ref arrays. In strange cases, this could add to
// OldTypeRefs.Unknown.
for (const auto &Array : OldTypeRefs.Arrays)
Array.second->replaceAllUsesWith(resolveTypeRefArray(Array.first.get()));
OldTypeRefs.Arrays.clear();
// Replace old string-based type refs with the resolved node, if possible.
// If we haven't seen the node, leave it to the verifier to complain about
// the invalid string reference.
for (const auto &Ref : OldTypeRefs.Unknown) {
if (DICompositeType *CT = OldTypeRefs.Final.lookup(Ref.first))
Ref.second->replaceAllUsesWith(CT);
else
Ref.second->replaceAllUsesWith(Ref.first);
}
OldTypeRefs.Unknown.clear();
if (UnresolvedNodes.empty())
// Nothing to do.
return;
// Resolve any cycles.
for (unsigned I : UnresolvedNodes) {
auto &MD = MetadataPtrs[I];
auto *N = dyn_cast_or_null<MDNode>(MD);
if (!N)
continue;
assert(!N->isTemporary() && "Unexpected forward reference");
N->resolveCycles();
}
// Make sure we return early again until there's another unresolved ref.
UnresolvedNodes.clear();
}
void BitcodeReaderMetadataList::addTypeRef(MDString &UUID,
DICompositeType &CT) {
assert(CT.getRawIdentifier() == &UUID && "Mismatched UUID");
if (CT.isForwardDecl())
OldTypeRefs.FwdDecls.insert(std::make_pair(&UUID, &CT));
else
OldTypeRefs.Final.insert(std::make_pair(&UUID, &CT));
}
Metadata *BitcodeReaderMetadataList::upgradeTypeRef(Metadata *MaybeUUID) {
auto *UUID = dyn_cast_or_null<MDString>(MaybeUUID);
if (LLVM_LIKELY(!UUID))
return MaybeUUID;
if (auto *CT = OldTypeRefs.Final.lookup(UUID))
return CT;
auto &Ref = OldTypeRefs.Unknown[UUID];
if (!Ref)
Ref = MDNode::getTemporary(Context, None);
return Ref.get();
}
Metadata *BitcodeReaderMetadataList::upgradeTypeRefArray(Metadata *MaybeTuple) {
auto *Tuple = dyn_cast_or_null<MDTuple>(MaybeTuple);
if (!Tuple || Tuple->isDistinct())
return MaybeTuple;
// Look through the array immediately if possible.
if (!Tuple->isTemporary())
return resolveTypeRefArray(Tuple);
// Create and return a placeholder to use for now. Eventually
// resolveTypeRefArrays() will be resolve this forward reference.
OldTypeRefs.Arrays.emplace_back(
std::piecewise_construct, std::forward_as_tuple(Tuple),
std::forward_as_tuple(MDTuple::getTemporary(Context, None)));
return OldTypeRefs.Arrays.back().second.get();
}
Metadata *BitcodeReaderMetadataList::resolveTypeRefArray(Metadata *MaybeTuple) {
auto *Tuple = dyn_cast_or_null<MDTuple>(MaybeTuple);
if (!Tuple || Tuple->isDistinct())
return MaybeTuple;
// Look through the DITypeRefArray, upgrading each DITypeRef.
SmallVector<Metadata *, 32> Ops;
Ops.reserve(Tuple->getNumOperands());
for (Metadata *MD : Tuple->operands())
Ops.push_back(upgradeTypeRef(MD));
return MDTuple::get(Context, Ops);
}
namespace {
class PlaceholderQueue {
// Placeholders would thrash around when moved, so store in a std::deque
// instead of some sort of vector.
std::deque<DistinctMDOperandPlaceholder> PHs;
public:
bool empty() { return PHs.empty(); }
DistinctMDOperandPlaceholder &getPlaceholderOp(unsigned ID);
void flush(BitcodeReaderMetadataList &MetadataList);
/// Return the list of temporaries nodes in the queue, these need to be
/// loaded before we can flush the queue.
void getTemporaries(BitcodeReaderMetadataList &MetadataList,
DenseSet<unsigned> &Temporaries) {
for (auto &PH : PHs) {
auto ID = PH.getID();
auto *MD = MetadataList.lookup(ID);
if (!MD) {
Temporaries.insert(ID);
continue;
}
auto *N = dyn_cast_or_null<MDNode>(MD);
if (N && N->isTemporary())
Temporaries.insert(ID);
}
}
};
} // end anonymous namespace
DistinctMDOperandPlaceholder &PlaceholderQueue::getPlaceholderOp(unsigned ID) {
PHs.emplace_back(ID);
return PHs.back();
}
void PlaceholderQueue::flush(BitcodeReaderMetadataList &MetadataList) {
while (!PHs.empty()) {
auto *MD = MetadataList.lookup(PHs.front().getID());
assert(MD && "Flushing placeholder on unassigned MD");
#ifndef NDEBUG
if (auto *MDN = dyn_cast<MDNode>(MD))
assert(MDN->isResolved() &&
"Flushing Placeholder while cycles aren't resolved");
#endif
PHs.front().replaceUseWith(MD);
PHs.pop_front();
}
}
} // anonynous namespace
class MetadataLoader::MetadataLoaderImpl {
BitcodeReaderMetadataList MetadataList;
BitcodeReaderValueList &ValueList;
BitstreamCursor &Stream;
LLVMContext &Context;
Module &TheModule;
std::function<Type *(unsigned)> getTypeByID;
/// Cursor associated with the lazy-loading of Metadata. This is the easy way
/// to keep around the right "context" (Abbrev list) to be able to jump in
/// the middle of the metadata block and load any record.
BitstreamCursor IndexCursor;
/// Index that keeps track of MDString values.
std::vector<StringRef> MDStringRef;
/// On-demand loading of a single MDString. Requires the index above to be
/// populated.
MDString *lazyLoadOneMDString(unsigned Idx);
/// Index that keeps track of where to find a metadata record in the stream.
std::vector<uint64_t> GlobalMetadataBitPosIndex;
/// Populate the index above to enable lazily loading of metadata, and load
/// the named metadata as well as the transitively referenced global
/// Metadata.
Expected<bool> lazyLoadModuleMetadataBlock();
/// On-demand loading of a single metadata. Requires the index above to be
/// populated.
void lazyLoadOneMetadata(unsigned Idx, PlaceholderQueue &Placeholders);
// Keep mapping of seens pair of old-style CU <-> SP, and update pointers to
// point from SP to CU after a block is completly parsed.
std::vector<std::pair<DICompileUnit *, Metadata *>> CUSubprograms;
/// Functions that need to be matched with subprograms when upgrading old
/// metadata.
SmallDenseMap<Function *, DISubprogram *, 16> FunctionsWithSPs;
// Map the bitcode's custom MDKind ID to the Module's MDKind ID.
DenseMap<unsigned, unsigned> MDKindMap;
bool StripTBAA = false;
bool HasSeenOldLoopTags = false;
+ bool NeedUpgradeToDIGlobalVariableExpression = false;
/// True if metadata is being parsed for a module being ThinLTO imported.
bool IsImporting = false;
Error parseOneMetadata(SmallVectorImpl<uint64_t> &Record, unsigned Code,
PlaceholderQueue &Placeholders, StringRef Blob,
unsigned &NextMetadataNo);
Error parseMetadataStrings(ArrayRef<uint64_t> Record, StringRef Blob,
std::function<void(StringRef)> CallBack);
Error parseGlobalObjectAttachment(GlobalObject &GO,
ArrayRef<uint64_t> Record);
Error parseMetadataKindRecord(SmallVectorImpl<uint64_t> &Record);
void resolveForwardRefsAndPlaceholders(PlaceholderQueue &Placeholders);
/// Upgrade old-style CU <-> SP pointers to point from SP to CU.
void upgradeCUSubprograms() {
for (auto CU_SP : CUSubprograms)
if (auto *SPs = dyn_cast_or_null<MDTuple>(CU_SP.second))
for (auto &Op : SPs->operands())
if (auto *SP = dyn_cast_or_null<MDNode>(Op))
SP->replaceOperandWith(7, CU_SP.first);
CUSubprograms.clear();
}
+ /// Upgrade old-style bare DIGlobalVariables to DIGlobalVariableExpressions.
+ void upgradeCUVariables() {
+ if (!NeedUpgradeToDIGlobalVariableExpression)
+ return;
+
+ // Upgrade list of variables attached to the CUs.
+ if (NamedMDNode *CUNodes = TheModule.getNamedMetadata("llvm.dbg.cu"))
+ for (unsigned I = 0, E = CUNodes->getNumOperands(); I != E; ++I) {
+ auto *CU = cast<DICompileUnit>(CUNodes->getOperand(I));
+ if (auto *GVs = dyn_cast_or_null<MDTuple>(CU->getRawGlobalVariables()))
+ for (unsigned I = 0; I < GVs->getNumOperands(); I++)
+ if (auto *GV =
+ dyn_cast_or_null<DIGlobalVariable>(GVs->getOperand(I))) {
+ auto *DGVE =
+ DIGlobalVariableExpression::getDistinct(Context, GV, nullptr);
+ GVs->replaceOperandWith(I, DGVE);
+ }
+ }
+
+ // Upgrade variables attached to globals.
+ for (auto &GV : TheModule.globals()) {
+ SmallVector<MDNode *, 1> MDs, NewMDs;
+ GV.getMetadata(LLVMContext::MD_dbg, MDs);
+ GV.eraseMetadata(LLVMContext::MD_dbg);
+ for (auto *MD : MDs)
+ if (auto *DGV = dyn_cast_or_null<DIGlobalVariable>(MD)) {
+ auto *DGVE =
+ DIGlobalVariableExpression::getDistinct(Context, DGV, nullptr);
+ GV.addMetadata(LLVMContext::MD_dbg, *DGVE);
+ } else
+ GV.addMetadata(LLVMContext::MD_dbg, *MD);
+ }
+ }
+
+ void upgradeDebugInfo() {
+ upgradeCUSubprograms();
+ upgradeCUVariables();
+ }
+
public:
MetadataLoaderImpl(BitstreamCursor &Stream, Module &TheModule,
BitcodeReaderValueList &ValueList,
std::function<Type *(unsigned)> getTypeByID,
bool IsImporting)
: MetadataList(TheModule.getContext()), ValueList(ValueList),
Stream(Stream), Context(TheModule.getContext()), TheModule(TheModule),
getTypeByID(getTypeByID), IsImporting(IsImporting) {}
Error parseMetadata(bool ModuleLevel);
bool hasFwdRefs() const { return MetadataList.hasFwdRefs(); }
Metadata *getMetadataFwdRefOrLoad(unsigned ID) {
if (ID < MDStringRef.size())
return lazyLoadOneMDString(ID);
if (auto *MD = MetadataList.lookup(ID))
return MD;
// If lazy-loading is enabled, we try recursively to load the operand
// instead of creating a temporary.
if (ID < (MDStringRef.size() + GlobalMetadataBitPosIndex.size())) {
PlaceholderQueue Placeholders;
lazyLoadOneMetadata(ID, Placeholders);
resolveForwardRefsAndPlaceholders(Placeholders);
return MetadataList.lookup(ID);
}
return MetadataList.getMetadataFwdRef(ID);
}
MDNode *getMDNodeFwdRefOrNull(unsigned Idx) {
return MetadataList.getMDNodeFwdRefOrNull(Idx);
}
DISubprogram *lookupSubprogramForFunction(Function *F) {
return FunctionsWithSPs.lookup(F);
}
bool hasSeenOldLoopTags() { return HasSeenOldLoopTags; }
Error parseMetadataAttachment(
Function &F, const SmallVectorImpl<Instruction *> &InstructionList);
Error parseMetadataKinds();
void setStripTBAA(bool Value) { StripTBAA = Value; }
bool isStrippingTBAA() { return StripTBAA; }
unsigned size() const { return MetadataList.size(); }
void shrinkTo(unsigned N) { MetadataList.shrinkTo(N); }
};
Error error(const Twine &Message) {
return make_error<StringError>(
Message, make_error_code(BitcodeError::CorruptedBitcode));
}
Expected<bool>
MetadataLoader::MetadataLoaderImpl::lazyLoadModuleMetadataBlock() {
IndexCursor = Stream;
SmallVector<uint64_t, 64> Record;
// Get the abbrevs, and preload record positions to make them lazy-loadable.
while (true) {
BitstreamEntry Entry = IndexCursor.advanceSkippingSubblocks(
BitstreamCursor::AF_DontPopBlockAtEnd);
switch (Entry.Kind) {
case BitstreamEntry::SubBlock: // Handled for us already.
case BitstreamEntry::Error:
return error("Malformed block");
case BitstreamEntry::EndBlock: {
return true;
}
case BitstreamEntry::Record: {
// The interesting case.
++NumMDRecordLoaded;
uint64_t CurrentPos = IndexCursor.GetCurrentBitNo();
auto Code = IndexCursor.skipRecord(Entry.ID);
switch (Code) {
case bitc::METADATA_STRINGS: {
// Rewind and parse the strings.
IndexCursor.JumpToBit(CurrentPos);
StringRef Blob;
Record.clear();
IndexCursor.readRecord(Entry.ID, Record, &Blob);
unsigned NumStrings = Record[0];
MDStringRef.reserve(NumStrings);
auto IndexNextMDString = [&](StringRef Str) {
MDStringRef.push_back(Str);
};
if (auto Err = parseMetadataStrings(Record, Blob, IndexNextMDString))
return std::move(Err);
break;
}
case bitc::METADATA_INDEX_OFFSET: {
// This is the offset to the index, when we see this we skip all the
// records and load only an index to these.
IndexCursor.JumpToBit(CurrentPos);
Record.clear();
IndexCursor.readRecord(Entry.ID, Record);
if (Record.size() != 2)
return error("Invalid record");
auto Offset = Record[0] + (Record[1] << 32);
auto BeginPos = IndexCursor.GetCurrentBitNo();
IndexCursor.JumpToBit(BeginPos + Offset);
Entry = IndexCursor.advanceSkippingSubblocks(
BitstreamCursor::AF_DontPopBlockAtEnd);
assert(Entry.Kind == BitstreamEntry::Record &&
"Corrupted bitcode: Expected `Record` when trying to find the "
"Metadata index");
Record.clear();
auto Code = IndexCursor.readRecord(Entry.ID, Record);
(void)Code;
assert(Code == bitc::METADATA_INDEX && "Corrupted bitcode: Expected "
"`METADATA_INDEX` when trying "
"to find the Metadata index");
// Delta unpack
auto CurrentValue = BeginPos;
GlobalMetadataBitPosIndex.reserve(Record.size());
for (auto &Elt : Record) {
CurrentValue += Elt;
GlobalMetadataBitPosIndex.push_back(CurrentValue);
}
break;
}
case bitc::METADATA_INDEX:
// We don't expect to get there, the Index is loaded when we encounter
// the offset.
return error("Corrupted Metadata block");
case bitc::METADATA_NAME: {
// Named metadata need to be materialized now and aren't deferred.
IndexCursor.JumpToBit(CurrentPos);
Record.clear();
unsigned Code = IndexCursor.readRecord(Entry.ID, Record);
assert(Code == bitc::METADATA_NAME);
// Read name of the named metadata.
SmallString<8> Name(Record.begin(), Record.end());
Code = IndexCursor.ReadCode();
// Named Metadata comes in two parts, we expect the name to be followed
// by the node
Record.clear();
unsigned NextBitCode = IndexCursor.readRecord(Code, Record);
assert(NextBitCode == bitc::METADATA_NAMED_NODE);
(void)NextBitCode;
// Read named metadata elements.
unsigned Size = Record.size();
NamedMDNode *NMD = TheModule.getOrInsertNamedMetadata(Name);
for (unsigned i = 0; i != Size; ++i) {
// FIXME: We could use a placeholder here, however NamedMDNode are
// taking MDNode as operand and not using the Metadata infrastructure.
// It is acknowledged by 'TODO: Inherit from Metadata' in the
// NamedMDNode class definition.
MDNode *MD = MetadataList.getMDNodeFwdRefOrNull(Record[i]);
assert(MD && "Invalid record");
NMD->addOperand(MD);
}
break;
}
case bitc::METADATA_GLOBAL_DECL_ATTACHMENT: {
// FIXME: we need to do this early because we don't materialize global
// value explicitly.
IndexCursor.JumpToBit(CurrentPos);
Record.clear();
IndexCursor.readRecord(Entry.ID, Record);
if (Record.size() % 2 == 0)
return error("Invalid record");
unsigned ValueID = Record[0];
if (ValueID >= ValueList.size())
return error("Invalid record");
if (auto *GO = dyn_cast<GlobalObject>(ValueList[ValueID]))
if (Error Err = parseGlobalObjectAttachment(
*GO, ArrayRef<uint64_t>(Record).slice(1)))
return std::move(Err);
break;
}
case bitc::METADATA_KIND:
case bitc::METADATA_STRING_OLD:
case bitc::METADATA_OLD_FN_NODE:
case bitc::METADATA_OLD_NODE:
case bitc::METADATA_VALUE:
case bitc::METADATA_DISTINCT_NODE:
case bitc::METADATA_NODE:
case bitc::METADATA_LOCATION:
case bitc::METADATA_GENERIC_DEBUG:
case bitc::METADATA_SUBRANGE:
case bitc::METADATA_ENUMERATOR:
case bitc::METADATA_BASIC_TYPE:
case bitc::METADATA_DERIVED_TYPE:
case bitc::METADATA_COMPOSITE_TYPE:
case bitc::METADATA_SUBROUTINE_TYPE:
case bitc::METADATA_MODULE:
case bitc::METADATA_FILE:
case bitc::METADATA_COMPILE_UNIT:
case bitc::METADATA_SUBPROGRAM:
case bitc::METADATA_LEXICAL_BLOCK:
case bitc::METADATA_LEXICAL_BLOCK_FILE:
case bitc::METADATA_NAMESPACE:
case bitc::METADATA_MACRO:
case bitc::METADATA_MACRO_FILE:
case bitc::METADATA_TEMPLATE_TYPE:
case bitc::METADATA_TEMPLATE_VALUE:
case bitc::METADATA_GLOBAL_VAR:
case bitc::METADATA_LOCAL_VAR:
case bitc::METADATA_EXPRESSION:
case bitc::METADATA_OBJC_PROPERTY:
case bitc::METADATA_IMPORTED_ENTITY:
case bitc::METADATA_GLOBAL_VAR_EXPR:
// We don't expect to see any of these, if we see one, give up on
// lazy-loading and fallback.
MDStringRef.clear();
GlobalMetadataBitPosIndex.clear();
return false;
}
break;
}
}
}
}
/// Parse a METADATA_BLOCK. If ModuleLevel is true then we are parsing
/// module level metadata.
Error MetadataLoader::MetadataLoaderImpl::parseMetadata(bool ModuleLevel) {
if (!ModuleLevel && MetadataList.hasFwdRefs())
return error("Invalid metadata: fwd refs into function blocks");
// Record the entry position so that we can jump back here and efficiently
// skip the whole block in case we lazy-load.
auto EntryPos = Stream.GetCurrentBitNo();
if (Stream.EnterSubBlock(bitc::METADATA_BLOCK_ID))
return error("Invalid record");
SmallVector<uint64_t, 64> Record;
PlaceholderQueue Placeholders;
// We lazy-load module-level metadata: we build an index for each record, and
// then load individual record as needed, starting with the named metadata.
if (ModuleLevel && IsImporting && MetadataList.empty() &&
!DisableLazyLoading) {
auto SuccessOrErr = lazyLoadModuleMetadataBlock();
if (!SuccessOrErr)
return SuccessOrErr.takeError();
if (SuccessOrErr.get()) {
// An index was successfully created and we will be able to load metadata
// on-demand.
MetadataList.resize(MDStringRef.size() +
GlobalMetadataBitPosIndex.size());
// Reading the named metadata created forward references and/or
// placeholders, that we flush here.
resolveForwardRefsAndPlaceholders(Placeholders);
- upgradeCUSubprograms();
+ upgradeDebugInfo();
// Return at the beginning of the block, since it is easy to skip it
// entirely from there.
Stream.ReadBlockEnd(); // Pop the abbrev block context.
Stream.JumpToBit(EntryPos);
if (Stream.SkipBlock())
return error("Invalid record");
return Error::success();
}
// Couldn't load an index, fallback to loading all the block "old-style".
}
unsigned NextMetadataNo = MetadataList.size();
// Read all the records.
while (true) {
BitstreamEntry Entry = Stream.advanceSkippingSubblocks();
switch (Entry.Kind) {
case BitstreamEntry::SubBlock: // Handled for us already.
case BitstreamEntry::Error:
return error("Malformed block");
case BitstreamEntry::EndBlock:
resolveForwardRefsAndPlaceholders(Placeholders);
- upgradeCUSubprograms();
+ upgradeDebugInfo();
return Error::success();
case BitstreamEntry::Record:
// The interesting case.
break;
}
// Read a record.
Record.clear();
StringRef Blob;
++NumMDRecordLoaded;
unsigned Code = Stream.readRecord(Entry.ID, Record, &Blob);
if (Error Err =
parseOneMetadata(Record, Code, Placeholders, Blob, NextMetadataNo))
return Err;
}
}
MDString *MetadataLoader::MetadataLoaderImpl::lazyLoadOneMDString(unsigned ID) {
++NumMDStringLoaded;
if (Metadata *MD = MetadataList.lookup(ID))
return cast<MDString>(MD);
auto MDS = MDString::get(Context, MDStringRef[ID]);
MetadataList.assignValue(MDS, ID);
return MDS;
}
void MetadataLoader::MetadataLoaderImpl::lazyLoadOneMetadata(
unsigned ID, PlaceholderQueue &Placeholders) {
assert(ID < (MDStringRef.size()) + GlobalMetadataBitPosIndex.size());
assert(ID >= MDStringRef.size() && "Unexpected lazy-loading of MDString");
// Lookup first if the metadata hasn't already been loaded.
if (auto *MD = MetadataList.lookup(ID)) {
auto *N = dyn_cast_or_null<MDNode>(MD);
if (!N->isTemporary())
return;
}
SmallVector<uint64_t, 64> Record;
StringRef Blob;
IndexCursor.JumpToBit(GlobalMetadataBitPosIndex[ID - MDStringRef.size()]);
auto Entry = IndexCursor.advanceSkippingSubblocks();
++NumMDRecordLoaded;
unsigned Code = IndexCursor.readRecord(Entry.ID, Record, &Blob);
if (Error Err = parseOneMetadata(Record, Code, Placeholders, Blob, ID))
report_fatal_error("Can't lazyload MD");
}
/// Ensure that all forward-references and placeholders are resolved.
/// Iteratively lazy-loading metadata on-demand if needed.
void MetadataLoader::MetadataLoaderImpl::resolveForwardRefsAndPlaceholders(
PlaceholderQueue &Placeholders) {
DenseSet<unsigned> Temporaries;
while (1) {
// Populate Temporaries with the placeholders that haven't been loaded yet.
Placeholders.getTemporaries(MetadataList, Temporaries);
// If we don't have any temporary, or FwdReference, we're done!
if (Temporaries.empty() && !MetadataList.hasFwdRefs())
break;
// First, load all the temporaries. This can add new placeholders or
// forward references.
for (auto ID : Temporaries)
lazyLoadOneMetadata(ID, Placeholders);
Temporaries.clear();
// Second, load the forward-references. This can also add new placeholders
// or forward references.
while (MetadataList.hasFwdRefs())
lazyLoadOneMetadata(MetadataList.getNextFwdRef(), Placeholders);
}
// At this point we don't have any forward reference remaining, or temporary
// that haven't been loaded. We can safely drop RAUW support and mark cycles
// as resolved.
MetadataList.tryToResolveCycles();
// Finally, everything is in place, we can replace the placeholders operands
// with the final node they refer to.
Placeholders.flush(MetadataList);
}
Error MetadataLoader::MetadataLoaderImpl::parseOneMetadata(
SmallVectorImpl<uint64_t> &Record, unsigned Code,
PlaceholderQueue &Placeholders, StringRef Blob, unsigned &NextMetadataNo) {
bool IsDistinct = false;
auto getMD = [&](unsigned ID) -> Metadata * {
if (ID < MDStringRef.size())
return lazyLoadOneMDString(ID);
if (!IsDistinct) {
if (auto *MD = MetadataList.lookup(ID))
return MD;
// If lazy-loading is enabled, we try recursively to load the operand
// instead of creating a temporary.
if (ID < (MDStringRef.size() + GlobalMetadataBitPosIndex.size())) {
// Create a temporary for the node that is referencing the operand we
// will lazy-load. It is needed before recursing in case there are
// uniquing cycles.
MetadataList.getMetadataFwdRef(NextMetadataNo);
lazyLoadOneMetadata(ID, Placeholders);
return MetadataList.lookup(ID);
}
// Return a temporary.
return MetadataList.getMetadataFwdRef(ID);
}
if (auto *MD = MetadataList.getMetadataIfResolved(ID))
return MD;
return &Placeholders.getPlaceholderOp(ID);
};
auto getMDOrNull = [&](unsigned ID) -> Metadata * {
if (ID)
return getMD(ID - 1);
return nullptr;
};
auto getMDOrNullWithoutPlaceholders = [&](unsigned ID) -> Metadata * {
if (ID)
return MetadataList.getMetadataFwdRef(ID - 1);
return nullptr;
};
auto getMDString = [&](unsigned ID) -> MDString * {
// This requires that the ID is not really a forward reference. In
// particular, the MDString must already have been resolved.
auto MDS = getMDOrNull(ID);
return cast_or_null<MDString>(MDS);
};
// Support for old type refs.
auto getDITypeRefOrNull = [&](unsigned ID) {
return MetadataList.upgradeTypeRef(getMDOrNull(ID));
};
#define GET_OR_DISTINCT(CLASS, ARGS) \
(IsDistinct ? CLASS::getDistinct ARGS : CLASS::get ARGS)
switch (Code) {
default: // Default behavior: ignore.
break;
case bitc::METADATA_NAME: {
// Read name of the named metadata.
SmallString<8> Name(Record.begin(), Record.end());
Record.clear();
Code = Stream.ReadCode();
++NumMDRecordLoaded;
unsigned NextBitCode = Stream.readRecord(Code, Record);
if (NextBitCode != bitc::METADATA_NAMED_NODE)
return error("METADATA_NAME not followed by METADATA_NAMED_NODE");
// Read named metadata elements.
unsigned Size = Record.size();
NamedMDNode *NMD = TheModule.getOrInsertNamedMetadata(Name);
for (unsigned i = 0; i != Size; ++i) {
MDNode *MD = MetadataList.getMDNodeFwdRefOrNull(Record[i]);
if (!MD)
return error("Invalid record");
NMD->addOperand(MD);
}
break;
}
case bitc::METADATA_OLD_FN_NODE: {
// FIXME: Remove in 4.0.
// This is a LocalAsMetadata record, the only type of function-local
// metadata.
if (Record.size() % 2 == 1)
return error("Invalid record");
// If this isn't a LocalAsMetadata record, we're dropping it. This used
// to be legal, but there's no upgrade path.
auto dropRecord = [&] {
MetadataList.assignValue(MDNode::get(Context, None), NextMetadataNo);
NextMetadataNo++;
};
if (Record.size() != 2) {
dropRecord();
break;
}
Type *Ty = getTypeByID(Record[0]);
if (Ty->isMetadataTy() || Ty->isVoidTy()) {
dropRecord();
break;
}
MetadataList.assignValue(
LocalAsMetadata::get(ValueList.getValueFwdRef(Record[1], Ty)),
NextMetadataNo);
NextMetadataNo++;
break;
}
case bitc::METADATA_OLD_NODE: {
// FIXME: Remove in 4.0.
if (Record.size() % 2 == 1)
return error("Invalid record");
unsigned Size = Record.size();
SmallVector<Metadata *, 8> Elts;
for (unsigned i = 0; i != Size; i += 2) {
Type *Ty = getTypeByID(Record[i]);
if (!Ty)
return error("Invalid record");
if (Ty->isMetadataTy())
Elts.push_back(getMD(Record[i + 1]));
else if (!Ty->isVoidTy()) {
auto *MD =
ValueAsMetadata::get(ValueList.getValueFwdRef(Record[i + 1], Ty));
assert(isa<ConstantAsMetadata>(MD) &&
"Expected non-function-local metadata");
Elts.push_back(MD);
} else
Elts.push_back(nullptr);
}
MetadataList.assignValue(MDNode::get(Context, Elts), NextMetadataNo);
NextMetadataNo++;
break;
}
case bitc::METADATA_VALUE: {
if (Record.size() != 2)
return error("Invalid record");
Type *Ty = getTypeByID(Record[0]);
if (Ty->isMetadataTy() || Ty->isVoidTy())
return error("Invalid record");
MetadataList.assignValue(
ValueAsMetadata::get(ValueList.getValueFwdRef(Record[1], Ty)),
NextMetadataNo);
NextMetadataNo++;
break;
}
case bitc::METADATA_DISTINCT_NODE:
IsDistinct = true;
LLVM_FALLTHROUGH;
case bitc::METADATA_NODE: {
SmallVector<Metadata *, 8> Elts;
Elts.reserve(Record.size());
for (unsigned ID : Record)
Elts.push_back(getMDOrNull(ID));
MetadataList.assignValue(IsDistinct ? MDNode::getDistinct(Context, Elts)
: MDNode::get(Context, Elts),
NextMetadataNo);
NextMetadataNo++;
break;
}
case bitc::METADATA_LOCATION: {
if (Record.size() != 5)
return error("Invalid record");
IsDistinct = Record[0];
unsigned Line = Record[1];
unsigned Column = Record[2];
Metadata *Scope = getMD(Record[3]);
Metadata *InlinedAt = getMDOrNull(Record[4]);
MetadataList.assignValue(
GET_OR_DISTINCT(DILocation, (Context, Line, Column, Scope, InlinedAt)),
NextMetadataNo);
NextMetadataNo++;
break;
}
case bitc::METADATA_GENERIC_DEBUG: {
if (Record.size() < 4)
return error("Invalid record");
IsDistinct = Record[0];
unsigned Tag = Record[1];
unsigned Version = Record[2];
if (Tag >= 1u << 16 || Version != 0)
return error("Invalid record");
auto *Header = getMDString(Record[3]);
SmallVector<Metadata *, 8> DwarfOps;
for (unsigned I = 4, E = Record.size(); I != E; ++I)
DwarfOps.push_back(getMDOrNull(Record[I]));
MetadataList.assignValue(
GET_OR_DISTINCT(GenericDINode, (Context, Tag, Header, DwarfOps)),
NextMetadataNo);
NextMetadataNo++;
break;
}
case bitc::METADATA_SUBRANGE: {
if (Record.size() != 3)
return error("Invalid record");
IsDistinct = Record[0];
MetadataList.assignValue(
GET_OR_DISTINCT(DISubrange,
(Context, Record[1], unrotateSign(Record[2]))),
NextMetadataNo);
NextMetadataNo++;
break;
}
case bitc::METADATA_ENUMERATOR: {
if (Record.size() != 3)
return error("Invalid record");
IsDistinct = Record[0];
MetadataList.assignValue(
GET_OR_DISTINCT(DIEnumerator, (Context, unrotateSign(Record[1]),
getMDString(Record[2]))),
NextMetadataNo);
NextMetadataNo++;
break;
}
case bitc::METADATA_BASIC_TYPE: {
if (Record.size() != 6)
return error("Invalid record");
IsDistinct = Record[0];
MetadataList.assignValue(
GET_OR_DISTINCT(DIBasicType,
(Context, Record[1], getMDString(Record[2]), Record[3],
Record[4], Record[5])),
NextMetadataNo);
NextMetadataNo++;
break;
}
case bitc::METADATA_DERIVED_TYPE: {
if (Record.size() != 12)
return error("Invalid record");
IsDistinct = Record[0];
DINode::DIFlags Flags = static_cast<DINode::DIFlags>(Record[10]);
MetadataList.assignValue(
GET_OR_DISTINCT(DIDerivedType,
(Context, Record[1], getMDString(Record[2]),
getMDOrNull(Record[3]), Record[4],
getDITypeRefOrNull(Record[5]),
getDITypeRefOrNull(Record[6]), Record[7], Record[8],
Record[9], Flags, getDITypeRefOrNull(Record[11]))),
NextMetadataNo);
NextMetadataNo++;
break;
}
case bitc::METADATA_COMPOSITE_TYPE: {
if (Record.size() != 16)
return error("Invalid record");
// If we have a UUID and this is not a forward declaration, lookup the
// mapping.
IsDistinct = Record[0] & 0x1;
bool IsNotUsedInTypeRef = Record[0] >= 2;
unsigned Tag = Record[1];
MDString *Name = getMDString(Record[2]);
Metadata *File = getMDOrNull(Record[3]);
unsigned Line = Record[4];
Metadata *Scope = getDITypeRefOrNull(Record[5]);
Metadata *BaseType = nullptr;
uint64_t SizeInBits = Record[7];
if (Record[8] > (uint64_t)std::numeric_limits<uint32_t>::max())
return error("Alignment value is too large");
uint32_t AlignInBits = Record[8];
uint64_t OffsetInBits = 0;
DINode::DIFlags Flags = static_cast<DINode::DIFlags>(Record[10]);
Metadata *Elements = nullptr;
unsigned RuntimeLang = Record[12];
Metadata *VTableHolder = nullptr;
Metadata *TemplateParams = nullptr;
auto *Identifier = getMDString(Record[15]);
// If this module is being parsed so that it can be ThinLTO imported
// into another module, composite types only need to be imported
// as type declarations (unless full type definitions requested).
// Create type declarations up front to save memory. Also, buildODRType
// handles the case where this is type ODRed with a definition needed
// by the importing module, in which case the existing definition is
// used.
if (IsImporting && !ImportFullTypeDefinitions && Identifier &&
(Tag == dwarf::DW_TAG_enumeration_type ||
Tag == dwarf::DW_TAG_class_type ||
Tag == dwarf::DW_TAG_structure_type ||
Tag == dwarf::DW_TAG_union_type)) {
Flags = Flags | DINode::FlagFwdDecl;
} else {
BaseType = getDITypeRefOrNull(Record[6]);
OffsetInBits = Record[9];
Elements = getMDOrNull(Record[11]);
VTableHolder = getDITypeRefOrNull(Record[13]);
TemplateParams = getMDOrNull(Record[14]);
}
DICompositeType *CT = nullptr;
if (Identifier)
CT = DICompositeType::buildODRType(
Context, *Identifier, Tag, Name, File, Line, Scope, BaseType,
SizeInBits, AlignInBits, OffsetInBits, Flags, Elements, RuntimeLang,
VTableHolder, TemplateParams);
// Create a node if we didn't get a lazy ODR type.
if (!CT)
CT = GET_OR_DISTINCT(DICompositeType,
(Context, Tag, Name, File, Line, Scope, BaseType,
SizeInBits, AlignInBits, OffsetInBits, Flags,
Elements, RuntimeLang, VTableHolder, TemplateParams,
Identifier));
if (!IsNotUsedInTypeRef && Identifier)
MetadataList.addTypeRef(*Identifier, *cast<DICompositeType>(CT));
MetadataList.assignValue(CT, NextMetadataNo);
NextMetadataNo++;
break;
}
case bitc::METADATA_SUBROUTINE_TYPE: {
if (Record.size() < 3 || Record.size() > 4)
return error("Invalid record");
bool IsOldTypeRefArray = Record[0] < 2;
unsigned CC = (Record.size() > 3) ? Record[3] : 0;
IsDistinct = Record[0] & 0x1;
DINode::DIFlags Flags = static_cast<DINode::DIFlags>(Record[1]);
Metadata *Types = getMDOrNull(Record[2]);
if (LLVM_UNLIKELY(IsOldTypeRefArray))
Types = MetadataList.upgradeTypeRefArray(Types);
MetadataList.assignValue(
GET_OR_DISTINCT(DISubroutineType, (Context, Flags, CC, Types)),
NextMetadataNo);
NextMetadataNo++;
break;
}
case bitc::METADATA_MODULE: {
if (Record.size() != 6)
return error("Invalid record");
IsDistinct = Record[0];
MetadataList.assignValue(
GET_OR_DISTINCT(DIModule,
(Context, getMDOrNull(Record[1]),
getMDString(Record[2]), getMDString(Record[3]),
getMDString(Record[4]), getMDString(Record[5]))),
NextMetadataNo);
NextMetadataNo++;
break;
}
case bitc::METADATA_FILE: {
if (Record.size() != 3 && Record.size() != 5)
return error("Invalid record");
IsDistinct = Record[0];
MetadataList.assignValue(
GET_OR_DISTINCT(
DIFile,
(Context, getMDString(Record[1]), getMDString(Record[2]),
Record.size() == 3 ? DIFile::CSK_None
: static_cast<DIFile::ChecksumKind>(Record[3]),
Record.size() == 3 ? nullptr : getMDString(Record[4]))),
NextMetadataNo);
NextMetadataNo++;
break;
}
case bitc::METADATA_COMPILE_UNIT: {
if (Record.size() < 14 || Record.size() > 17)
return error("Invalid record");
// Ignore Record[0], which indicates whether this compile unit is
// distinct. It's always distinct.
IsDistinct = true;
auto *CU = DICompileUnit::getDistinct(
Context, Record[1], getMDOrNull(Record[2]), getMDString(Record[3]),
Record[4], getMDString(Record[5]), Record[6], getMDString(Record[7]),
Record[8], getMDOrNull(Record[9]), getMDOrNull(Record[10]),
getMDOrNull(Record[12]), getMDOrNull(Record[13]),
Record.size() <= 15 ? nullptr : getMDOrNull(Record[15]),
Record.size() <= 14 ? 0 : Record[14],
Record.size() <= 16 ? true : Record[16]);
MetadataList.assignValue(CU, NextMetadataNo);
NextMetadataNo++;
// Move the Upgrade the list of subprograms.
if (Metadata *SPs = getMDOrNullWithoutPlaceholders(Record[11]))
CUSubprograms.push_back({CU, SPs});
break;
}
case bitc::METADATA_SUBPROGRAM: {
if (Record.size() < 18 || Record.size() > 20)
return error("Invalid record");
IsDistinct =
(Record[0] & 1) || Record[8]; // All definitions should be distinct.
// Version 1 has a Function as Record[15].
// Version 2 has removed Record[15].
// Version 3 has the Unit as Record[15].
// Version 4 added thisAdjustment.
bool HasUnit = Record[0] >= 2;
if (HasUnit && Record.size() < 19)
return error("Invalid record");
Metadata *CUorFn = getMDOrNull(Record[15]);
unsigned Offset = Record.size() >= 19 ? 1 : 0;
bool HasFn = Offset && !HasUnit;
bool HasThisAdj = Record.size() >= 20;
DISubprogram *SP = GET_OR_DISTINCT(
DISubprogram, (Context,
getDITypeRefOrNull(Record[1]), // scope
getMDString(Record[2]), // name
getMDString(Record[3]), // linkageName
getMDOrNull(Record[4]), // file
Record[5], // line
getMDOrNull(Record[6]), // type
Record[7], // isLocal
Record[8], // isDefinition
Record[9], // scopeLine
getDITypeRefOrNull(Record[10]), // containingType
Record[11], // virtuality
Record[12], // virtualIndex
HasThisAdj ? Record[19] : 0, // thisAdjustment
static_cast<DINode::DIFlags>(Record[13] // flags
),
Record[14], // isOptimized
HasUnit ? CUorFn : nullptr, // unit
getMDOrNull(Record[15 + Offset]), // templateParams
getMDOrNull(Record[16 + Offset]), // declaration
getMDOrNull(Record[17 + Offset]) // variables
));
MetadataList.assignValue(SP, NextMetadataNo);
NextMetadataNo++;
// Upgrade sp->function mapping to function->sp mapping.
if (HasFn) {
if (auto *CMD = dyn_cast_or_null<ConstantAsMetadata>(CUorFn))
if (auto *F = dyn_cast<Function>(CMD->getValue())) {
if (F->isMaterializable())
// Defer until materialized; unmaterialized functions may not have
// metadata.
FunctionsWithSPs[F] = SP;
else if (!F->empty())
F->setSubprogram(SP);
}
}
break;
}
case bitc::METADATA_LEXICAL_BLOCK: {
if (Record.size() != 5)
return error("Invalid record");
IsDistinct = Record[0];
MetadataList.assignValue(
GET_OR_DISTINCT(DILexicalBlock,
(Context, getMDOrNull(Record[1]),
getMDOrNull(Record[2]), Record[3], Record[4])),
NextMetadataNo);
NextMetadataNo++;
break;
}
case bitc::METADATA_LEXICAL_BLOCK_FILE: {
if (Record.size() != 4)
return error("Invalid record");
IsDistinct = Record[0];
MetadataList.assignValue(
GET_OR_DISTINCT(DILexicalBlockFile,
(Context, getMDOrNull(Record[1]),
getMDOrNull(Record[2]), Record[3])),
NextMetadataNo);
NextMetadataNo++;
break;
}
case bitc::METADATA_NAMESPACE: {
if (Record.size() != 5)
return error("Invalid record");
IsDistinct = Record[0] & 1;
bool ExportSymbols = Record[0] & 2;
MetadataList.assignValue(
GET_OR_DISTINCT(DINamespace,
(Context, getMDOrNull(Record[1]),
getMDOrNull(Record[2]), getMDString(Record[3]),
Record[4], ExportSymbols)),
NextMetadataNo);
NextMetadataNo++;
break;
}
case bitc::METADATA_MACRO: {
if (Record.size() != 5)
return error("Invalid record");
IsDistinct = Record[0];
MetadataList.assignValue(
GET_OR_DISTINCT(DIMacro,
(Context, Record[1], Record[2], getMDString(Record[3]),
getMDString(Record[4]))),
NextMetadataNo);
NextMetadataNo++;
break;
}
case bitc::METADATA_MACRO_FILE: {
if (Record.size() != 5)
return error("Invalid record");
IsDistinct = Record[0];
MetadataList.assignValue(
GET_OR_DISTINCT(DIMacroFile,
(Context, Record[1], Record[2], getMDOrNull(Record[3]),
getMDOrNull(Record[4]))),
NextMetadataNo);
NextMetadataNo++;
break;
}
case bitc::METADATA_TEMPLATE_TYPE: {
if (Record.size() != 3)
return error("Invalid record");
IsDistinct = Record[0];
MetadataList.assignValue(GET_OR_DISTINCT(DITemplateTypeParameter,
(Context, getMDString(Record[1]),
getDITypeRefOrNull(Record[2]))),
NextMetadataNo);
NextMetadataNo++;
break;
}
case bitc::METADATA_TEMPLATE_VALUE: {
if (Record.size() != 5)
return error("Invalid record");
IsDistinct = Record[0];
MetadataList.assignValue(
GET_OR_DISTINCT(DITemplateValueParameter,
(Context, Record[1], getMDString(Record[2]),
getDITypeRefOrNull(Record[3]),
getMDOrNull(Record[4]))),
NextMetadataNo);
NextMetadataNo++;
break;
}
case bitc::METADATA_GLOBAL_VAR: {
if (Record.size() < 11 || Record.size() > 12)
return error("Invalid record");
IsDistinct = Record[0] & 1;
unsigned Version = Record[0] >> 1;
if (Version == 1) {
MetadataList.assignValue(
GET_OR_DISTINCT(DIGlobalVariable,
(Context, getMDOrNull(Record[1]),
getMDString(Record[2]), getMDString(Record[3]),
getMDOrNull(Record[4]), Record[5],
getDITypeRefOrNull(Record[6]), Record[7], Record[8],
getMDOrNull(Record[10]), Record[11])),
NextMetadataNo);
NextMetadataNo++;
} else if (Version == 0) {
// Upgrade old metadata, which stored a global variable reference or a
// ConstantInt here.
Metadata *Expr = getMDOrNull(Record[9]);
uint32_t AlignInBits = 0;
if (Record.size() > 11) {
if (Record[11] > (uint64_t)std::numeric_limits<uint32_t>::max())
return error("Alignment value is too large");
AlignInBits = Record[11];
}
GlobalVariable *Attach = nullptr;
if (auto *CMD = dyn_cast_or_null<ConstantAsMetadata>(Expr)) {
if (auto *GV = dyn_cast<GlobalVariable>(CMD->getValue())) {
Attach = GV;
Expr = nullptr;
} else if (auto *CI = dyn_cast<ConstantInt>(CMD->getValue())) {
Expr = DIExpression::get(Context,
{dwarf::DW_OP_constu, CI->getZExtValue(),
dwarf::DW_OP_stack_value});
} else {
Expr = nullptr;
}
}
DIGlobalVariable *DGV = GET_OR_DISTINCT(
DIGlobalVariable,
(Context, getMDOrNull(Record[1]), getMDString(Record[2]),
getMDString(Record[3]), getMDOrNull(Record[4]), Record[5],
getDITypeRefOrNull(Record[6]), Record[7], Record[8],
getMDOrNull(Record[10]), AlignInBits));
- auto *DGVE = DIGlobalVariableExpression::getDistinct(Context, DGV, Expr);
- MetadataList.assignValue(DGVE, NextMetadataNo);
- NextMetadataNo++;
+ DIGlobalVariableExpression *DGVE = nullptr;
+ if (Attach || Expr)
+ DGVE = DIGlobalVariableExpression::getDistinct(Context, DGV, Expr);
+ else
+ NeedUpgradeToDIGlobalVariableExpression = true;
if (Attach)
Attach->addDebugInfo(DGVE);
+
+ auto *MDNode = Expr ? cast<Metadata>(DGVE) : cast<Metadata>(DGV);
+ MetadataList.assignValue(MDNode, NextMetadataNo);
+ NextMetadataNo++;
} else
return error("Invalid record");
break;
}
case bitc::METADATA_LOCAL_VAR: {
// 10th field is for the obseleted 'inlinedAt:' field.
if (Record.size() < 8 || Record.size() > 10)
return error("Invalid record");
IsDistinct = Record[0] & 1;
bool HasAlignment = Record[0] & 2;
// 2nd field used to be an artificial tag, either DW_TAG_auto_variable or
// DW_TAG_arg_variable, if we have alignment flag encoded it means, that
// this is newer version of record which doesn't have artifical tag.
bool HasTag = !HasAlignment && Record.size() > 8;
DINode::DIFlags Flags = static_cast<DINode::DIFlags>(Record[7 + HasTag]);
uint32_t AlignInBits = 0;
if (HasAlignment) {
if (Record[8 + HasTag] > (uint64_t)std::numeric_limits<uint32_t>::max())
return error("Alignment value is too large");
AlignInBits = Record[8 + HasTag];
}
MetadataList.assignValue(
GET_OR_DISTINCT(DILocalVariable,
(Context, getMDOrNull(Record[1 + HasTag]),
getMDString(Record[2 + HasTag]),
getMDOrNull(Record[3 + HasTag]), Record[4 + HasTag],
getDITypeRefOrNull(Record[5 + HasTag]),
Record[6 + HasTag], Flags, AlignInBits)),
NextMetadataNo);
NextMetadataNo++;
break;
}
case bitc::METADATA_EXPRESSION: {
if (Record.size() < 1)
return error("Invalid record");
IsDistinct = Record[0] & 1;
bool HasOpFragment = Record[0] & 2;
auto Elts = MutableArrayRef<uint64_t>(Record).slice(1);
if (!HasOpFragment)
if (unsigned N = Elts.size())
if (N >= 3 && Elts[N - 3] == dwarf::DW_OP_bit_piece)
Elts[N - 3] = dwarf::DW_OP_LLVM_fragment;
MetadataList.assignValue(
GET_OR_DISTINCT(DIExpression, (Context, makeArrayRef(Record).slice(1))),
NextMetadataNo);
NextMetadataNo++;
break;
}
case bitc::METADATA_GLOBAL_VAR_EXPR: {
if (Record.size() != 3)
return error("Invalid record");
IsDistinct = Record[0];
MetadataList.assignValue(GET_OR_DISTINCT(DIGlobalVariableExpression,
(Context, getMDOrNull(Record[1]),
getMDOrNull(Record[2]))),
NextMetadataNo);
NextMetadataNo++;
break;
}
case bitc::METADATA_OBJC_PROPERTY: {
if (Record.size() != 8)
return error("Invalid record");
IsDistinct = Record[0];
MetadataList.assignValue(
GET_OR_DISTINCT(DIObjCProperty,
(Context, getMDString(Record[1]),
getMDOrNull(Record[2]), Record[3],
getMDString(Record[4]), getMDString(Record[5]),
Record[6], getDITypeRefOrNull(Record[7]))),
NextMetadataNo);
NextMetadataNo++;
break;
}
case bitc::METADATA_IMPORTED_ENTITY: {
if (Record.size() != 6)
return error("Invalid record");
IsDistinct = Record[0];
MetadataList.assignValue(
GET_OR_DISTINCT(DIImportedEntity,
(Context, Record[1], getMDOrNull(Record[2]),
getDITypeRefOrNull(Record[3]), Record[4],
getMDString(Record[5]))),
NextMetadataNo);
NextMetadataNo++;
break;
}
case bitc::METADATA_STRING_OLD: {
std::string String(Record.begin(), Record.end());
// Test for upgrading !llvm.loop.
HasSeenOldLoopTags |= mayBeOldLoopAttachmentTag(String);
++NumMDStringLoaded;
Metadata *MD = MDString::get(Context, String);
MetadataList.assignValue(MD, NextMetadataNo);
NextMetadataNo++;
break;
}
case bitc::METADATA_STRINGS: {
auto CreateNextMDString = [&](StringRef Str) {
++NumMDStringLoaded;
MetadataList.assignValue(MDString::get(Context, Str), NextMetadataNo);
NextMetadataNo++;
};
if (Error Err = parseMetadataStrings(Record, Blob, CreateNextMDString))
return Err;
break;
}
case bitc::METADATA_GLOBAL_DECL_ATTACHMENT: {
if (Record.size() % 2 == 0)
return error("Invalid record");
unsigned ValueID = Record[0];
if (ValueID >= ValueList.size())
return error("Invalid record");
if (auto *GO = dyn_cast<GlobalObject>(ValueList[ValueID]))
if (Error Err = parseGlobalObjectAttachment(
*GO, ArrayRef<uint64_t>(Record).slice(1)))
return Err;
break;
}
case bitc::METADATA_KIND: {
// Support older bitcode files that had METADATA_KIND records in a
// block with METADATA_BLOCK_ID.
if (Error Err = parseMetadataKindRecord(Record))
return Err;
break;
}
}
return Error::success();
#undef GET_OR_DISTINCT
}
Error MetadataLoader::MetadataLoaderImpl::parseMetadataStrings(
ArrayRef<uint64_t> Record, StringRef Blob,
std::function<void(StringRef)> CallBack) {
// All the MDStrings in the block are emitted together in a single
// record. The strings are concatenated and stored in a blob along with
// their sizes.
if (Record.size() != 2)
return error("Invalid record: metadata strings layout");
unsigned NumStrings = Record[0];
unsigned StringsOffset = Record[1];
if (!NumStrings)
return error("Invalid record: metadata strings with no strings");
if (StringsOffset > Blob.size())
return error("Invalid record: metadata strings corrupt offset");
StringRef Lengths = Blob.slice(0, StringsOffset);
SimpleBitstreamCursor R(Lengths);
StringRef Strings = Blob.drop_front(StringsOffset);
do {
if (R.AtEndOfStream())
return error("Invalid record: metadata strings bad length");
unsigned Size = R.ReadVBR(6);
if (Strings.size() < Size)
return error("Invalid record: metadata strings truncated chars");
CallBack(Strings.slice(0, Size));
Strings = Strings.drop_front(Size);
} while (--NumStrings);
return Error::success();
}
Error MetadataLoader::MetadataLoaderImpl::parseGlobalObjectAttachment(
GlobalObject &GO, ArrayRef<uint64_t> Record) {
assert(Record.size() % 2 == 0);
for (unsigned I = 0, E = Record.size(); I != E; I += 2) {
auto K = MDKindMap.find(Record[I]);
if (K == MDKindMap.end())
return error("Invalid ID");
MDNode *MD = MetadataList.getMDNodeFwdRefOrNull(Record[I + 1]);
if (!MD)
return error("Invalid metadata attachment");
GO.addMetadata(K->second, *MD);
}
return Error::success();
}
/// Parse metadata attachments.
Error MetadataLoader::MetadataLoaderImpl::parseMetadataAttachment(
Function &F, const SmallVectorImpl<Instruction *> &InstructionList) {
if (Stream.EnterSubBlock(bitc::METADATA_ATTACHMENT_ID))
return error("Invalid record");
SmallVector<uint64_t, 64> Record;
PlaceholderQueue Placeholders;
while (true) {
BitstreamEntry Entry = Stream.advanceSkippingSubblocks();
switch (Entry.Kind) {
case BitstreamEntry::SubBlock: // Handled for us already.
case BitstreamEntry::Error:
return error("Malformed block");
case BitstreamEntry::EndBlock:
resolveForwardRefsAndPlaceholders(Placeholders);
return Error::success();
case BitstreamEntry::Record:
// The interesting case.
break;
}
// Read a metadata attachment record.
Record.clear();
++NumMDRecordLoaded;
switch (Stream.readRecord(Entry.ID, Record)) {
default: // Default behavior: ignore.
break;
case bitc::METADATA_ATTACHMENT: {
unsigned RecordLength = Record.size();
if (Record.empty())
return error("Invalid record");
if (RecordLength % 2 == 0) {
// A function attachment.
if (Error Err = parseGlobalObjectAttachment(F, Record))
return Err;
continue;
}
// An instruction attachment.
Instruction *Inst = InstructionList[Record[0]];
for (unsigned i = 1; i != RecordLength; i = i + 2) {
unsigned Kind = Record[i];
DenseMap<unsigned, unsigned>::iterator I = MDKindMap.find(Kind);
if (I == MDKindMap.end())
return error("Invalid ID");
if (I->second == LLVMContext::MD_tbaa && StripTBAA)
continue;
auto Idx = Record[i + 1];
if (Idx < (MDStringRef.size() + GlobalMetadataBitPosIndex.size()) &&
!MetadataList.lookup(Idx)) {
// Load the attachment if it is in the lazy-loadable range and hasn't
// been loaded yet.
lazyLoadOneMetadata(Idx, Placeholders);
resolveForwardRefsAndPlaceholders(Placeholders);
}
Metadata *Node = MetadataList.getMetadataFwdRef(Idx);
if (isa<LocalAsMetadata>(Node))
// Drop the attachment. This used to be legal, but there's no
// upgrade path.
break;
MDNode *MD = dyn_cast_or_null<MDNode>(Node);
if (!MD)
return error("Invalid metadata attachment");
if (HasSeenOldLoopTags && I->second == LLVMContext::MD_loop)
MD = upgradeInstructionLoopAttachment(*MD);
if (I->second == LLVMContext::MD_tbaa) {
assert(!MD->isTemporary() && "should load MDs before attachments");
MD = UpgradeTBAANode(*MD);
}
Inst->setMetadata(I->second, MD);
}
break;
}
}
}
}
/// Parse a single METADATA_KIND record, inserting result in MDKindMap.
Error MetadataLoader::MetadataLoaderImpl::parseMetadataKindRecord(
SmallVectorImpl<uint64_t> &Record) {
if (Record.size() < 2)
return error("Invalid record");
unsigned Kind = Record[0];
SmallString<8> Name(Record.begin() + 1, Record.end());
unsigned NewKind = TheModule.getMDKindID(Name.str());
if (!MDKindMap.insert(std::make_pair(Kind, NewKind)).second)
return error("Conflicting METADATA_KIND records");
return Error::success();
}
/// Parse the metadata kinds out of the METADATA_KIND_BLOCK.
Error MetadataLoader::MetadataLoaderImpl::parseMetadataKinds() {
if (Stream.EnterSubBlock(bitc::METADATA_KIND_BLOCK_ID))
return error("Invalid record");
SmallVector<uint64_t, 64> Record;
// Read all the records.
while (true) {
BitstreamEntry Entry = Stream.advanceSkippingSubblocks();
switch (Entry.Kind) {
case BitstreamEntry::SubBlock: // Handled for us already.
case BitstreamEntry::Error:
return error("Malformed block");
case BitstreamEntry::EndBlock:
return Error::success();
case BitstreamEntry::Record:
// The interesting case.
break;
}
// Read a record.
Record.clear();
++NumMDRecordLoaded;
unsigned Code = Stream.readRecord(Entry.ID, Record);
switch (Code) {
default: // Default behavior: ignore.
break;
case bitc::METADATA_KIND: {
if (Error Err = parseMetadataKindRecord(Record))
return Err;
break;
}
}
}
}
MetadataLoader &MetadataLoader::operator=(MetadataLoader &&RHS) {
Pimpl = std::move(RHS.Pimpl);
return *this;
}
MetadataLoader::MetadataLoader(MetadataLoader &&RHS)
: Pimpl(std::move(RHS.Pimpl)) {}
MetadataLoader::~MetadataLoader() = default;
MetadataLoader::MetadataLoader(BitstreamCursor &Stream, Module &TheModule,
BitcodeReaderValueList &ValueList,
bool IsImporting,
std::function<Type *(unsigned)> getTypeByID)
: Pimpl(llvm::make_unique<MetadataLoaderImpl>(Stream, TheModule, ValueList,
getTypeByID, IsImporting)) {}
Error MetadataLoader::parseMetadata(bool ModuleLevel) {
return Pimpl->parseMetadata(ModuleLevel);
}
bool MetadataLoader::hasFwdRefs() const { return Pimpl->hasFwdRefs(); }
/// Return the given metadata, creating a replaceable forward reference if
/// necessary.
Metadata *MetadataLoader::getMetadataFwdRefOrLoad(unsigned Idx) {
return Pimpl->getMetadataFwdRefOrLoad(Idx);
}
MDNode *MetadataLoader::getMDNodeFwdRefOrNull(unsigned Idx) {
return Pimpl->getMDNodeFwdRefOrNull(Idx);
}
DISubprogram *MetadataLoader::lookupSubprogramForFunction(Function *F) {
return Pimpl->lookupSubprogramForFunction(F);
}
Error MetadataLoader::parseMetadataAttachment(
Function &F, const SmallVectorImpl<Instruction *> &InstructionList) {
return Pimpl->parseMetadataAttachment(F, InstructionList);
}
Error MetadataLoader::parseMetadataKinds() {
return Pimpl->parseMetadataKinds();
}
void MetadataLoader::setStripTBAA(bool StripTBAA) {
return Pimpl->setStripTBAA(StripTBAA);
}
bool MetadataLoader::isStrippingTBAA() { return Pimpl->isStrippingTBAA(); }
unsigned MetadataLoader::size() const { return Pimpl->size(); }
void MetadataLoader::shrinkTo(unsigned N) { return Pimpl->shrinkTo(N); }
Index: projects/clang400-import/contrib/llvm/lib/CodeGen/BranchFolding.cpp
===================================================================
--- projects/clang400-import/contrib/llvm/lib/CodeGen/BranchFolding.cpp (revision 313642)
+++ projects/clang400-import/contrib/llvm/lib/CodeGen/BranchFolding.cpp (revision 313643)
@@ -1,1933 +1,1896 @@
//===-- BranchFolding.cpp - Fold machine code branch instructions ---------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This pass forwards branches to unconditional branches to make them branch
// directly to the target block. This pass often results in dead MBB's, which
// it then removes.
//
// Note that this pass must be run after register allocation, it cannot handle
// SSA form. It also must handle virtual registers for targets that emit virtual
// ISA (e.g. NVPTX).
//
//===----------------------------------------------------------------------===//
#include "BranchFolding.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/Analysis.h"
#include "llvm/CodeGen/MachineBlockFrequencyInfo.h"
#include "llvm/CodeGen/MachineBranchProbabilityInfo.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineJumpTableInfo.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/Function.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetRegisterInfo.h"
#include "llvm/Target/TargetSubtargetInfo.h"
#include <algorithm>
using namespace llvm;
#define DEBUG_TYPE "branchfolding"
STATISTIC(NumDeadBlocks, "Number of dead blocks removed");
STATISTIC(NumBranchOpts, "Number of branches optimized");
STATISTIC(NumTailMerge , "Number of block tails merged");
STATISTIC(NumHoist , "Number of times common instructions are hoisted");
-STATISTIC(NumTailCalls, "Number of tail calls optimized");
static cl::opt<cl::boolOrDefault> FlagEnableTailMerge("enable-tail-merge",
cl::init(cl::BOU_UNSET), cl::Hidden);
// Throttle for huge numbers of predecessors (compile speed problems)
static cl::opt<unsigned>
TailMergeThreshold("tail-merge-threshold",
cl::desc("Max number of predecessors to consider tail merging"),
cl::init(150), cl::Hidden);
// Heuristic for tail merging (and, inversely, tail duplication).
// TODO: This should be replaced with a target query.
static cl::opt<unsigned>
TailMergeSize("tail-merge-size",
cl::desc("Min number of instructions to consider tail merging"),
cl::init(3), cl::Hidden);
namespace {
/// BranchFolderPass - Wrap branch folder in a machine function pass.
class BranchFolderPass : public MachineFunctionPass {
public:
static char ID;
explicit BranchFolderPass(): MachineFunctionPass(ID) {}
bool runOnMachineFunction(MachineFunction &MF) override;
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<MachineBlockFrequencyInfo>();
AU.addRequired<MachineBranchProbabilityInfo>();
AU.addRequired<TargetPassConfig>();
MachineFunctionPass::getAnalysisUsage(AU);
}
};
}
char BranchFolderPass::ID = 0;
char &llvm::BranchFolderPassID = BranchFolderPass::ID;
INITIALIZE_PASS(BranchFolderPass, "branch-folder",
"Control Flow Optimizer", false, false)
bool BranchFolderPass::runOnMachineFunction(MachineFunction &MF) {
if (skipFunction(*MF.getFunction()))
return false;
TargetPassConfig *PassConfig = &getAnalysis<TargetPassConfig>();
// TailMerge can create jump into if branches that make CFG irreducible for
// HW that requires structurized CFG.
bool EnableTailMerge = !MF.getTarget().requiresStructuredCFG() &&
PassConfig->getEnableTailMerge();
BranchFolder::MBFIWrapper MBBFreqInfo(
getAnalysis<MachineBlockFrequencyInfo>());
BranchFolder Folder(EnableTailMerge, /*CommonHoist=*/true, MBBFreqInfo,
getAnalysis<MachineBranchProbabilityInfo>());
return Folder.OptimizeFunction(MF, MF.getSubtarget().getInstrInfo(),
MF.getSubtarget().getRegisterInfo(),
getAnalysisIfAvailable<MachineModuleInfo>());
}
BranchFolder::BranchFolder(bool defaultEnableTailMerge, bool CommonHoist,
MBFIWrapper &FreqInfo,
const MachineBranchProbabilityInfo &ProbInfo,
unsigned MinTailLength)
: EnableHoistCommonCode(CommonHoist), MinCommonTailLength(MinTailLength),
MBBFreqInfo(FreqInfo), MBPI(ProbInfo) {
if (MinCommonTailLength == 0)
MinCommonTailLength = TailMergeSize;
switch (FlagEnableTailMerge) {
case cl::BOU_UNSET: EnableTailMerge = defaultEnableTailMerge; break;
case cl::BOU_TRUE: EnableTailMerge = true; break;
case cl::BOU_FALSE: EnableTailMerge = false; break;
}
}
/// RemoveDeadBlock - Remove the specified dead machine basic block from the
/// function, updating the CFG.
void BranchFolder::RemoveDeadBlock(MachineBasicBlock *MBB) {
assert(MBB->pred_empty() && "MBB must be dead!");
DEBUG(dbgs() << "\nRemoving MBB: " << *MBB);
MachineFunction *MF = MBB->getParent();
// drop all successors.
while (!MBB->succ_empty())
MBB->removeSuccessor(MBB->succ_end()-1);
// Avoid matching if this pointer gets reused.
TriedMerging.erase(MBB);
// Remove the block.
MF->erase(MBB);
FuncletMembership.erase(MBB);
if (MLI)
MLI->removeBlock(MBB);
}
/// OptimizeFunction - Perhaps branch folding, tail merging and other
/// CFG optimizations on the given function. Block placement changes the layout
/// and may create new tail merging opportunities.
bool BranchFolder::OptimizeFunction(MachineFunction &MF,
const TargetInstrInfo *tii,
const TargetRegisterInfo *tri,
MachineModuleInfo *mmi,
MachineLoopInfo *mli, bool AfterPlacement) {
if (!tii) return false;
TriedMerging.clear();
AfterBlockPlacement = AfterPlacement;
TII = tii;
TRI = tri;
MMI = mmi;
MLI = mli;
MachineRegisterInfo &MRI = MF.getRegInfo();
UpdateLiveIns = MRI.tracksLiveness() && TRI->trackLivenessAfterRegAlloc(MF);
if (!UpdateLiveIns)
MRI.invalidateLiveness();
// Fix CFG. The later algorithms expect it to be right.
bool MadeChange = false;
for (MachineBasicBlock &MBB : MF) {
MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
SmallVector<MachineOperand, 4> Cond;
if (!TII->analyzeBranch(MBB, TBB, FBB, Cond, true))
MadeChange |= MBB.CorrectExtraCFGEdges(TBB, FBB, !Cond.empty());
}
// Recalculate funclet membership.
FuncletMembership = getFuncletMembership(MF);
bool MadeChangeThisIteration = true;
while (MadeChangeThisIteration) {
MadeChangeThisIteration = TailMergeBlocks(MF);
// No need to clean up if tail merging does not change anything after the
// block placement.
if (!AfterBlockPlacement || MadeChangeThisIteration)
MadeChangeThisIteration |= OptimizeBranches(MF);
if (EnableHoistCommonCode)
MadeChangeThisIteration |= HoistCommonCode(MF);
MadeChange |= MadeChangeThisIteration;
}
// See if any jump tables have become dead as the code generator
// did its thing.
MachineJumpTableInfo *JTI = MF.getJumpTableInfo();
if (!JTI)
return MadeChange;
// Walk the function to find jump tables that are live.
BitVector JTIsLive(JTI->getJumpTables().size());
for (const MachineBasicBlock &BB : MF) {
for (const MachineInstr &I : BB)
for (const MachineOperand &Op : I.operands()) {
if (!Op.isJTI()) continue;
// Remember that this JT is live.
JTIsLive.set(Op.getIndex());
}
}
// Finally, remove dead jump tables. This happens when the
// indirect jump was unreachable (and thus deleted).
for (unsigned i = 0, e = JTIsLive.size(); i != e; ++i)
if (!JTIsLive.test(i)) {
JTI->RemoveJumpTable(i);
MadeChange = true;
}
return MadeChange;
}
//===----------------------------------------------------------------------===//
// Tail Merging of Blocks
//===----------------------------------------------------------------------===//
/// HashMachineInstr - Compute a hash value for MI and its operands.
static unsigned HashMachineInstr(const MachineInstr &MI) {
unsigned Hash = MI.getOpcode();
for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
const MachineOperand &Op = MI.getOperand(i);
// Merge in bits from the operand if easy. We can't use MachineOperand's
// hash_code here because it's not deterministic and we sort by hash value
// later.
unsigned OperandHash = 0;
switch (Op.getType()) {
case MachineOperand::MO_Register:
OperandHash = Op.getReg();
break;
case MachineOperand::MO_Immediate:
OperandHash = Op.getImm();
break;
case MachineOperand::MO_MachineBasicBlock:
OperandHash = Op.getMBB()->getNumber();
break;
case MachineOperand::MO_FrameIndex:
case MachineOperand::MO_ConstantPoolIndex:
case MachineOperand::MO_JumpTableIndex:
OperandHash = Op.getIndex();
break;
case MachineOperand::MO_GlobalAddress:
case MachineOperand::MO_ExternalSymbol:
// Global address / external symbol are too hard, don't bother, but do
// pull in the offset.
OperandHash = Op.getOffset();
break;
default:
break;
}
Hash += ((OperandHash << 3) | Op.getType()) << (i & 31);
}
return Hash;
}
/// HashEndOfMBB - Hash the last instruction in the MBB.
static unsigned HashEndOfMBB(const MachineBasicBlock &MBB) {
MachineBasicBlock::const_iterator I = MBB.getLastNonDebugInstr();
if (I == MBB.end())
return 0;
return HashMachineInstr(*I);
}
/// ComputeCommonTailLength - Given two machine basic blocks, compute the number
/// of instructions they actually have in common together at their end. Return
/// iterators for the first shared instruction in each block.
static unsigned ComputeCommonTailLength(MachineBasicBlock *MBB1,
MachineBasicBlock *MBB2,
MachineBasicBlock::iterator &I1,
MachineBasicBlock::iterator &I2) {
I1 = MBB1->end();
I2 = MBB2->end();
unsigned TailLen = 0;
while (I1 != MBB1->begin() && I2 != MBB2->begin()) {
--I1; --I2;
// Skip debugging pseudos; necessary to avoid changing the code.
while (I1->isDebugValue()) {
if (I1==MBB1->begin()) {
while (I2->isDebugValue()) {
if (I2==MBB2->begin())
// I1==DBG at begin; I2==DBG at begin
return TailLen;
--I2;
}
++I2;
// I1==DBG at begin; I2==non-DBG, or first of DBGs not at begin
return TailLen;
}
--I1;
}
// I1==first (untested) non-DBG preceding known match
while (I2->isDebugValue()) {
if (I2==MBB2->begin()) {
++I1;
// I1==non-DBG, or first of DBGs not at begin; I2==DBG at begin
return TailLen;
}
--I2;
}
// I1, I2==first (untested) non-DBGs preceding known match
if (!I1->isIdenticalTo(*I2) ||
// FIXME: This check is dubious. It's used to get around a problem where
// people incorrectly expect inline asm directives to remain in the same
// relative order. This is untenable because normal compiler
// optimizations (like this one) may reorder and/or merge these
// directives.
I1->isInlineAsm()) {
++I1; ++I2;
break;
}
++TailLen;
}
// Back past possible debugging pseudos at beginning of block. This matters
// when one block differs from the other only by whether debugging pseudos
// are present at the beginning. (This way, the various checks later for
// I1==MBB1->begin() work as expected.)
if (I1 == MBB1->begin() && I2 != MBB2->begin()) {
--I2;
while (I2->isDebugValue()) {
if (I2 == MBB2->begin())
return TailLen;
--I2;
}
++I2;
}
if (I2 == MBB2->begin() && I1 != MBB1->begin()) {
--I1;
while (I1->isDebugValue()) {
if (I1 == MBB1->begin())
return TailLen;
--I1;
}
++I1;
}
return TailLen;
}
/// ReplaceTailWithBranchTo - Delete the instruction OldInst and everything
/// after it, replacing it with an unconditional branch to NewDest.
void BranchFolder::ReplaceTailWithBranchTo(MachineBasicBlock::iterator OldInst,
MachineBasicBlock *NewDest) {
TII->ReplaceTailWithBranchTo(OldInst, NewDest);
if (UpdateLiveIns) {
NewDest->clearLiveIns();
computeLiveIns(LiveRegs, *TRI, *NewDest);
}
++NumTailMerge;
}
/// SplitMBBAt - Given a machine basic block and an iterator into it, split the
/// MBB so that the part before the iterator falls into the part starting at the
/// iterator. This returns the new MBB.
MachineBasicBlock *BranchFolder::SplitMBBAt(MachineBasicBlock &CurMBB,
MachineBasicBlock::iterator BBI1,
const BasicBlock *BB) {
if (!TII->isLegalToSplitMBBAt(CurMBB, BBI1))
return nullptr;
MachineFunction &MF = *CurMBB.getParent();
// Create the fall-through block.
MachineFunction::iterator MBBI = CurMBB.getIterator();
MachineBasicBlock *NewMBB =MF.CreateMachineBasicBlock(BB);
CurMBB.getParent()->insert(++MBBI, NewMBB);
// Move all the successors of this block to the specified block.
NewMBB->transferSuccessors(&CurMBB);
// Add an edge from CurMBB to NewMBB for the fall-through.
CurMBB.addSuccessor(NewMBB);
// Splice the code over.
NewMBB->splice(NewMBB->end(), &CurMBB, BBI1, CurMBB.end());
// NewMBB belongs to the same loop as CurMBB.
if (MLI)
if (MachineLoop *ML = MLI->getLoopFor(&CurMBB))
ML->addBasicBlockToLoop(NewMBB, MLI->getBase());
// NewMBB inherits CurMBB's block frequency.
MBBFreqInfo.setBlockFreq(NewMBB, MBBFreqInfo.getBlockFreq(&CurMBB));
if (UpdateLiveIns)
computeLiveIns(LiveRegs, *TRI, *NewMBB);
// Add the new block to the funclet.
const auto &FuncletI = FuncletMembership.find(&CurMBB);
if (FuncletI != FuncletMembership.end()) {
auto n = FuncletI->second;
FuncletMembership[NewMBB] = n;
}
return NewMBB;
}
/// EstimateRuntime - Make a rough estimate for how long it will take to run
/// the specified code.
static unsigned EstimateRuntime(MachineBasicBlock::iterator I,
MachineBasicBlock::iterator E) {
unsigned Time = 0;
for (; I != E; ++I) {
if (I->isDebugValue())
continue;
if (I->isCall())
Time += 10;
else if (I->mayLoad() || I->mayStore())
Time += 2;
else
++Time;
}
return Time;
}
// CurMBB needs to add an unconditional branch to SuccMBB (we removed these
// branches temporarily for tail merging). In the case where CurMBB ends
// with a conditional branch to the next block, optimize by reversing the
// test and conditionally branching to SuccMBB instead.
static void FixTail(MachineBasicBlock *CurMBB, MachineBasicBlock *SuccBB,
const TargetInstrInfo *TII) {
MachineFunction *MF = CurMBB->getParent();
MachineFunction::iterator I = std::next(MachineFunction::iterator(CurMBB));
MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
SmallVector<MachineOperand, 4> Cond;
DebugLoc dl; // FIXME: this is nowhere
if (I != MF->end() && !TII->analyzeBranch(*CurMBB, TBB, FBB, Cond, true)) {
MachineBasicBlock *NextBB = &*I;
if (TBB == NextBB && !Cond.empty() && !FBB) {
if (!TII->reverseBranchCondition(Cond)) {
TII->removeBranch(*CurMBB);
TII->insertBranch(*CurMBB, SuccBB, nullptr, Cond, dl);
return;
}
}
}
TII->insertBranch(*CurMBB, SuccBB, nullptr,
SmallVector<MachineOperand, 0>(), dl);
}
bool
BranchFolder::MergePotentialsElt::operator<(const MergePotentialsElt &o) const {
if (getHash() < o.getHash())
return true;
if (getHash() > o.getHash())
return false;
if (getBlock()->getNumber() < o.getBlock()->getNumber())
return true;
if (getBlock()->getNumber() > o.getBlock()->getNumber())
return false;
// _GLIBCXX_DEBUG checks strict weak ordering, which involves comparing
// an object with itself.
#ifndef _GLIBCXX_DEBUG
llvm_unreachable("Predecessor appears twice");
#else
return false;
#endif
}
BlockFrequency
BranchFolder::MBFIWrapper::getBlockFreq(const MachineBasicBlock *MBB) const {
auto I = MergedBBFreq.find(MBB);
if (I != MergedBBFreq.end())
return I->second;
return MBFI.getBlockFreq(MBB);
}
void BranchFolder::MBFIWrapper::setBlockFreq(const MachineBasicBlock *MBB,
BlockFrequency F) {
MergedBBFreq[MBB] = F;
}
raw_ostream &
BranchFolder::MBFIWrapper::printBlockFreq(raw_ostream &OS,
const MachineBasicBlock *MBB) const {
return MBFI.printBlockFreq(OS, getBlockFreq(MBB));
}
raw_ostream &
BranchFolder::MBFIWrapper::printBlockFreq(raw_ostream &OS,
const BlockFrequency Freq) const {
return MBFI.printBlockFreq(OS, Freq);
}
/// CountTerminators - Count the number of terminators in the given
/// block and set I to the position of the first non-terminator, if there
/// is one, or MBB->end() otherwise.
static unsigned CountTerminators(MachineBasicBlock *MBB,
MachineBasicBlock::iterator &I) {
I = MBB->end();
unsigned NumTerms = 0;
for (;;) {
if (I == MBB->begin()) {
I = MBB->end();
break;
}
--I;
if (!I->isTerminator()) break;
++NumTerms;
}
return NumTerms;
}
/// ProfitableToMerge - Check if two machine basic blocks have a common tail
/// and decide if it would be profitable to merge those tails. Return the
/// length of the common tail and iterators to the first common instruction
/// in each block.
/// MBB1, MBB2 The blocks to check
/// MinCommonTailLength Minimum size of tail block to be merged.
/// CommonTailLen Out parameter to record the size of the shared tail between
/// MBB1 and MBB2
/// I1, I2 Iterator references that will be changed to point to the first
/// instruction in the common tail shared by MBB1,MBB2
/// SuccBB A common successor of MBB1, MBB2 which are in a canonical form
/// relative to SuccBB
/// PredBB The layout predecessor of SuccBB, if any.
/// FuncletMembership map from block to funclet #.
/// AfterPlacement True if we are merging blocks after layout. Stricter
/// thresholds apply to prevent undoing tail-duplication.
static bool
ProfitableToMerge(MachineBasicBlock *MBB1, MachineBasicBlock *MBB2,
unsigned MinCommonTailLength, unsigned &CommonTailLen,
MachineBasicBlock::iterator &I1,
MachineBasicBlock::iterator &I2, MachineBasicBlock *SuccBB,
MachineBasicBlock *PredBB,
DenseMap<const MachineBasicBlock *, int> &FuncletMembership,
bool AfterPlacement) {
// It is never profitable to tail-merge blocks from two different funclets.
if (!FuncletMembership.empty()) {
auto Funclet1 = FuncletMembership.find(MBB1);
assert(Funclet1 != FuncletMembership.end());
auto Funclet2 = FuncletMembership.find(MBB2);
assert(Funclet2 != FuncletMembership.end());
if (Funclet1->second != Funclet2->second)
return false;
}
CommonTailLen = ComputeCommonTailLength(MBB1, MBB2, I1, I2);
if (CommonTailLen == 0)
return false;
DEBUG(dbgs() << "Common tail length of BB#" << MBB1->getNumber()
<< " and BB#" << MBB2->getNumber() << " is " << CommonTailLen
<< '\n');
// It's almost always profitable to merge any number of non-terminator
// instructions with the block that falls through into the common successor.
// This is true only for a single successor. For multiple successors, we are
// trading a conditional branch for an unconditional one.
// TODO: Re-visit successor size for non-layout tail merging.
if ((MBB1 == PredBB || MBB2 == PredBB) &&
(!AfterPlacement || MBB1->succ_size() == 1)) {
MachineBasicBlock::iterator I;
unsigned NumTerms = CountTerminators(MBB1 == PredBB ? MBB2 : MBB1, I);
if (CommonTailLen > NumTerms)
return true;
}
// If one of the blocks can be completely merged and happens to be in
// a position where the other could fall through into it, merge any number
// of instructions, because it can be done without a branch.
// TODO: If the blocks are not adjacent, move one of them so that they are?
if (MBB1->isLayoutSuccessor(MBB2) && I2 == MBB2->begin())
return true;
if (MBB2->isLayoutSuccessor(MBB1) && I1 == MBB1->begin())
return true;
// If both blocks have an unconditional branch temporarily stripped out,
// count that as an additional common instruction for the following
// heuristics. This heuristic is only accurate for single-succ blocks, so to
// make sure that during layout merging and duplicating don't crash, we check
// for that when merging during layout.
unsigned EffectiveTailLen = CommonTailLen;
if (SuccBB && MBB1 != PredBB && MBB2 != PredBB &&
(MBB1->succ_size() == 1 || !AfterPlacement) &&
!MBB1->back().isBarrier() &&
!MBB2->back().isBarrier())
++EffectiveTailLen;
// Check if the common tail is long enough to be worthwhile.
if (EffectiveTailLen >= MinCommonTailLength)
return true;
// If we are optimizing for code size, 2 instructions in common is enough if
// we don't have to split a block. At worst we will be introducing 1 new
// branch instruction, which is likely to be smaller than the 2
// instructions that would be deleted in the merge.
MachineFunction *MF = MBB1->getParent();
return EffectiveTailLen >= 2 && MF->getFunction()->optForSize() &&
(I1 == MBB1->begin() || I2 == MBB2->begin());
}
/// ComputeSameTails - Look through all the blocks in MergePotentials that have
/// hash CurHash (guaranteed to match the last element). Build the vector
/// SameTails of all those that have the (same) largest number of instructions
/// in common of any pair of these blocks. SameTails entries contain an
/// iterator into MergePotentials (from which the MachineBasicBlock can be
/// found) and a MachineBasicBlock::iterator into that MBB indicating the
/// instruction where the matching code sequence begins.
/// Order of elements in SameTails is the reverse of the order in which
/// those blocks appear in MergePotentials (where they are not necessarily
/// consecutive).
unsigned BranchFolder::ComputeSameTails(unsigned CurHash,
unsigned MinCommonTailLength,
MachineBasicBlock *SuccBB,
MachineBasicBlock *PredBB) {
unsigned maxCommonTailLength = 0U;
SameTails.clear();
MachineBasicBlock::iterator TrialBBI1, TrialBBI2;
MPIterator HighestMPIter = std::prev(MergePotentials.end());
for (MPIterator CurMPIter = std::prev(MergePotentials.end()),
B = MergePotentials.begin();
CurMPIter != B && CurMPIter->getHash() == CurHash; --CurMPIter) {
for (MPIterator I = std::prev(CurMPIter); I->getHash() == CurHash; --I) {
unsigned CommonTailLen;
if (ProfitableToMerge(CurMPIter->getBlock(), I->getBlock(),
MinCommonTailLength,
CommonTailLen, TrialBBI1, TrialBBI2,
SuccBB, PredBB,
FuncletMembership,
AfterBlockPlacement)) {
if (CommonTailLen > maxCommonTailLength) {
SameTails.clear();
maxCommonTailLength = CommonTailLen;
HighestMPIter = CurMPIter;
SameTails.push_back(SameTailElt(CurMPIter, TrialBBI1));
}
if (HighestMPIter == CurMPIter &&
CommonTailLen == maxCommonTailLength)
SameTails.push_back(SameTailElt(I, TrialBBI2));
}
if (I == B)
break;
}
}
return maxCommonTailLength;
}
/// RemoveBlocksWithHash - Remove all blocks with hash CurHash from
/// MergePotentials, restoring branches at ends of blocks as appropriate.
void BranchFolder::RemoveBlocksWithHash(unsigned CurHash,
MachineBasicBlock *SuccBB,
MachineBasicBlock *PredBB) {
MPIterator CurMPIter, B;
for (CurMPIter = std::prev(MergePotentials.end()),
B = MergePotentials.begin();
CurMPIter->getHash() == CurHash; --CurMPIter) {
// Put the unconditional branch back, if we need one.
MachineBasicBlock *CurMBB = CurMPIter->getBlock();
if (SuccBB && CurMBB != PredBB)
FixTail(CurMBB, SuccBB, TII);
if (CurMPIter == B)
break;
}
if (CurMPIter->getHash() != CurHash)
CurMPIter++;
MergePotentials.erase(CurMPIter, MergePotentials.end());
}
/// CreateCommonTailOnlyBlock - None of the blocks to be tail-merged consist
/// only of the common tail. Create a block that does by splitting one.
bool BranchFolder::CreateCommonTailOnlyBlock(MachineBasicBlock *&PredBB,
MachineBasicBlock *SuccBB,
unsigned maxCommonTailLength,
unsigned &commonTailIndex) {
commonTailIndex = 0;
unsigned TimeEstimate = ~0U;
for (unsigned i = 0, e = SameTails.size(); i != e; ++i) {
// Use PredBB if possible; that doesn't require a new branch.
if (SameTails[i].getBlock() == PredBB) {
commonTailIndex = i;
break;
}
// Otherwise, make a (fairly bogus) choice based on estimate of
// how long it will take the various blocks to execute.
unsigned t = EstimateRuntime(SameTails[i].getBlock()->begin(),
SameTails[i].getTailStartPos());
if (t <= TimeEstimate) {
TimeEstimate = t;
commonTailIndex = i;
}
}
MachineBasicBlock::iterator BBI =
SameTails[commonTailIndex].getTailStartPos();
MachineBasicBlock *MBB = SameTails[commonTailIndex].getBlock();
DEBUG(dbgs() << "\nSplitting BB#" << MBB->getNumber() << ", size "
<< maxCommonTailLength);
// If the split block unconditionally falls-thru to SuccBB, it will be
// merged. In control flow terms it should then take SuccBB's name. e.g. If
// SuccBB is an inner loop, the common tail is still part of the inner loop.
const BasicBlock *BB = (SuccBB && MBB->succ_size() == 1) ?
SuccBB->getBasicBlock() : MBB->getBasicBlock();
MachineBasicBlock *newMBB = SplitMBBAt(*MBB, BBI, BB);
if (!newMBB) {
DEBUG(dbgs() << "... failed!");
return false;
}
SameTails[commonTailIndex].setBlock(newMBB);
SameTails[commonTailIndex].setTailStartPos(newMBB->begin());
// If we split PredBB, newMBB is the new predecessor.
if (PredBB == MBB)
PredBB = newMBB;
return true;
}
static void
mergeOperations(MachineBasicBlock::iterator MBBIStartPos,
MachineBasicBlock &MBBCommon) {
MachineBasicBlock *MBB = MBBIStartPos->getParent();
// Note CommonTailLen does not necessarily matches the size of
// the common BB nor all its instructions because of debug
// instructions differences.
unsigned CommonTailLen = 0;
for (auto E = MBB->end(); MBBIStartPos != E; ++MBBIStartPos)
++CommonTailLen;
MachineBasicBlock::reverse_iterator MBBI = MBB->rbegin();
MachineBasicBlock::reverse_iterator MBBIE = MBB->rend();
MachineBasicBlock::reverse_iterator MBBICommon = MBBCommon.rbegin();
MachineBasicBlock::reverse_iterator MBBIECommon = MBBCommon.rend();
while (CommonTailLen--) {
assert(MBBI != MBBIE && "Reached BB end within common tail length!");
(void)MBBIE;
if (MBBI->isDebugValue()) {
++MBBI;
continue;
}
while ((MBBICommon != MBBIECommon) && MBBICommon->isDebugValue())
++MBBICommon;
assert(MBBICommon != MBBIECommon &&
"Reached BB end within common tail length!");
assert(MBBICommon->isIdenticalTo(*MBBI) && "Expected matching MIIs!");
// Merge MMOs from memory operations in the common block.
if (MBBICommon->mayLoad() || MBBICommon->mayStore())
MBBICommon->setMemRefs(MBBICommon->mergeMemRefsWith(*MBBI));
// Drop undef flags if they aren't present in all merged instructions.
for (unsigned I = 0, E = MBBICommon->getNumOperands(); I != E; ++I) {
MachineOperand &MO = MBBICommon->getOperand(I);
if (MO.isReg() && MO.isUndef()) {
const MachineOperand &OtherMO = MBBI->getOperand(I);
if (!OtherMO.isUndef())
MO.setIsUndef(false);
}
}
++MBBI;
++MBBICommon;
}
}
// See if any of the blocks in MergePotentials (which all have SuccBB as a
// successor, or all have no successor if it is null) can be tail-merged.
// If there is a successor, any blocks in MergePotentials that are not
// tail-merged and are not immediately before Succ must have an unconditional
// branch to Succ added (but the predecessor/successor lists need no
// adjustment). The lone predecessor of Succ that falls through into Succ,
// if any, is given in PredBB.
// MinCommonTailLength - Except for the special cases below, tail-merge if
// there are at least this many instructions in common.
bool BranchFolder::TryTailMergeBlocks(MachineBasicBlock *SuccBB,
MachineBasicBlock *PredBB,
unsigned MinCommonTailLength) {
bool MadeChange = false;
DEBUG(dbgs() << "\nTryTailMergeBlocks: ";
for (unsigned i = 0, e = MergePotentials.size(); i != e; ++i)
dbgs() << "BB#" << MergePotentials[i].getBlock()->getNumber()
<< (i == e-1 ? "" : ", ");
dbgs() << "\n";
if (SuccBB) {
dbgs() << " with successor BB#" << SuccBB->getNumber() << '\n';
if (PredBB)
dbgs() << " which has fall-through from BB#"
<< PredBB->getNumber() << "\n";
}
dbgs() << "Looking for common tails of at least "
<< MinCommonTailLength << " instruction"
<< (MinCommonTailLength == 1 ? "" : "s") << '\n';
);
// Sort by hash value so that blocks with identical end sequences sort
// together.
array_pod_sort(MergePotentials.begin(), MergePotentials.end());
// Walk through equivalence sets looking for actual exact matches.
while (MergePotentials.size() > 1) {
unsigned CurHash = MergePotentials.back().getHash();
// Build SameTails, identifying the set of blocks with this hash code
// and with the maximum number of instructions in common.
unsigned maxCommonTailLength = ComputeSameTails(CurHash,
MinCommonTailLength,
SuccBB, PredBB);
// If we didn't find any pair that has at least MinCommonTailLength
// instructions in common, remove all blocks with this hash code and retry.
if (SameTails.empty()) {
RemoveBlocksWithHash(CurHash, SuccBB, PredBB);
continue;
}
// If one of the blocks is the entire common tail (and not the entry
// block, which we can't jump to), we can treat all blocks with this same
// tail at once. Use PredBB if that is one of the possibilities, as that
// will not introduce any extra branches.
MachineBasicBlock *EntryBB =
&MergePotentials.front().getBlock()->getParent()->front();
unsigned commonTailIndex = SameTails.size();
// If there are two blocks, check to see if one can be made to fall through
// into the other.
if (SameTails.size() == 2 &&
SameTails[0].getBlock()->isLayoutSuccessor(SameTails[1].getBlock()) &&
SameTails[1].tailIsWholeBlock())
commonTailIndex = 1;
else if (SameTails.size() == 2 &&
SameTails[1].getBlock()->isLayoutSuccessor(
SameTails[0].getBlock()) &&
SameTails[0].tailIsWholeBlock())
commonTailIndex = 0;
else {
// Otherwise just pick one, favoring the fall-through predecessor if
// there is one.
for (unsigned i = 0, e = SameTails.size(); i != e; ++i) {
MachineBasicBlock *MBB = SameTails[i].getBlock();
if (MBB == EntryBB && SameTails[i].tailIsWholeBlock())
continue;
if (MBB == PredBB) {
commonTailIndex = i;
break;
}
if (SameTails[i].tailIsWholeBlock())
commonTailIndex = i;
}
}
if (commonTailIndex == SameTails.size() ||
(SameTails[commonTailIndex].getBlock() == PredBB &&
!SameTails[commonTailIndex].tailIsWholeBlock())) {
// None of the blocks consist entirely of the common tail.
// Split a block so that one does.
if (!CreateCommonTailOnlyBlock(PredBB, SuccBB,
maxCommonTailLength, commonTailIndex)) {
RemoveBlocksWithHash(CurHash, SuccBB, PredBB);
continue;
}
}
MachineBasicBlock *MBB = SameTails[commonTailIndex].getBlock();
// Recompute common tail MBB's edge weights and block frequency.
setCommonTailEdgeWeights(*MBB);
// Remove the original debug location from the common tail.
for (auto &MI : *MBB)
if (!MI.isDebugValue())
MI.setDebugLoc(DebugLoc());
// MBB is common tail. Adjust all other BB's to jump to this one.
// Traversal must be forwards so erases work.
DEBUG(dbgs() << "\nUsing common tail in BB#" << MBB->getNumber()
<< " for ");
for (unsigned int i=0, e = SameTails.size(); i != e; ++i) {
if (commonTailIndex == i)
continue;
DEBUG(dbgs() << "BB#" << SameTails[i].getBlock()->getNumber()
<< (i == e-1 ? "" : ", "));
// Merge operations (MMOs, undef flags)
mergeOperations(SameTails[i].getTailStartPos(), *MBB);
// Hack the end off BB i, making it jump to BB commonTailIndex instead.
ReplaceTailWithBranchTo(SameTails[i].getTailStartPos(), MBB);
// BB i is no longer a predecessor of SuccBB; remove it from the worklist.
MergePotentials.erase(SameTails[i].getMPIter());
}
DEBUG(dbgs() << "\n");
// We leave commonTailIndex in the worklist in case there are other blocks
// that match it with a smaller number of instructions.
MadeChange = true;
}
return MadeChange;
}
bool BranchFolder::TailMergeBlocks(MachineFunction &MF) {
bool MadeChange = false;
if (!EnableTailMerge) return MadeChange;
// First find blocks with no successors.
// Block placement does not create new tail merging opportunities for these
// blocks.
if (!AfterBlockPlacement) {
MergePotentials.clear();
for (MachineBasicBlock &MBB : MF) {
if (MergePotentials.size() == TailMergeThreshold)
break;
if (!TriedMerging.count(&MBB) && MBB.succ_empty())
MergePotentials.push_back(MergePotentialsElt(HashEndOfMBB(MBB), &MBB));
}
// If this is a large problem, avoid visiting the same basic blocks
// multiple times.
if (MergePotentials.size() == TailMergeThreshold)
for (unsigned i = 0, e = MergePotentials.size(); i != e; ++i)
TriedMerging.insert(MergePotentials[i].getBlock());
// See if we can do any tail merging on those.
if (MergePotentials.size() >= 2)
MadeChange |= TryTailMergeBlocks(nullptr, nullptr, MinCommonTailLength);
}
// Look at blocks (IBB) with multiple predecessors (PBB).
// We change each predecessor to a canonical form, by
// (1) temporarily removing any unconditional branch from the predecessor
// to IBB, and
// (2) alter conditional branches so they branch to the other block
// not IBB; this may require adding back an unconditional branch to IBB
// later, where there wasn't one coming in. E.g.
// Bcc IBB
// fallthrough to QBB
// here becomes
// Bncc QBB
// with a conceptual B to IBB after that, which never actually exists.
// With those changes, we see whether the predecessors' tails match,
// and merge them if so. We change things out of canonical form and
// back to the way they were later in the process. (OptimizeBranches
// would undo some of this, but we can't use it, because we'd get into
// a compile-time infinite loop repeatedly doing and undoing the same
// transformations.)
for (MachineFunction::iterator I = std::next(MF.begin()), E = MF.end();
I != E; ++I) {
if (I->pred_size() < 2) continue;
SmallPtrSet<MachineBasicBlock *, 8> UniquePreds;
MachineBasicBlock *IBB = &*I;
MachineBasicBlock *PredBB = &*std::prev(I);
MergePotentials.clear();
MachineLoop *ML;
// Bail if merging after placement and IBB is the loop header because
// -- If merging predecessors that belong to the same loop as IBB, the
// common tail of merged predecessors may become the loop top if block
// placement is called again and the predecessors may branch to this common
// tail and require more branches. This can be relaxed if
// MachineBlockPlacement::findBestLoopTop is more flexible.
// --If merging predecessors that do not belong to the same loop as IBB, the
// loop info of IBB's loop and the other loops may be affected. Calling the
// block placement again may make big change to the layout and eliminate the
// reason to do tail merging here.
if (AfterBlockPlacement && MLI) {
ML = MLI->getLoopFor(IBB);
if (ML && IBB == ML->getHeader())
continue;
}
for (MachineBasicBlock *PBB : I->predecessors()) {
if (MergePotentials.size() == TailMergeThreshold)
break;
if (TriedMerging.count(PBB))
continue;
// Skip blocks that loop to themselves, can't tail merge these.
if (PBB == IBB)
continue;
// Visit each predecessor only once.
if (!UniquePreds.insert(PBB).second)
continue;
// Skip blocks which may jump to a landing pad. Can't tail merge these.
if (PBB->hasEHPadSuccessor())
continue;
// After block placement, only consider predecessors that belong to the
// same loop as IBB. The reason is the same as above when skipping loop
// header.
if (AfterBlockPlacement && MLI)
if (ML != MLI->getLoopFor(PBB))
continue;
MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
SmallVector<MachineOperand, 4> Cond;
if (!TII->analyzeBranch(*PBB, TBB, FBB, Cond, true)) {
// Failing case: IBB is the target of a cbr, and we cannot reverse the
// branch.
SmallVector<MachineOperand, 4> NewCond(Cond);
if (!Cond.empty() && TBB == IBB) {
if (TII->reverseBranchCondition(NewCond))
continue;
// This is the QBB case described above
if (!FBB) {
auto Next = ++PBB->getIterator();
if (Next != MF.end())
FBB = &*Next;
}
}
// Failing case: the only way IBB can be reached from PBB is via
// exception handling. Happens for landing pads. Would be nice to have
// a bit in the edge so we didn't have to do all this.
if (IBB->isEHPad()) {
MachineFunction::iterator IP = ++PBB->getIterator();
MachineBasicBlock *PredNextBB = nullptr;
if (IP != MF.end())
PredNextBB = &*IP;
if (!TBB) {
if (IBB != PredNextBB) // fallthrough
continue;
} else if (FBB) {
if (TBB != IBB && FBB != IBB) // cbr then ubr
continue;
} else if (Cond.empty()) {
if (TBB != IBB) // ubr
continue;
} else {
if (TBB != IBB && IBB != PredNextBB) // cbr
continue;
}
}
// Remove the unconditional branch at the end, if any.
if (TBB && (Cond.empty() || FBB)) {
DebugLoc dl; // FIXME: this is nowhere
TII->removeBranch(*PBB);
if (!Cond.empty())
// reinsert conditional branch only, for now
TII->insertBranch(*PBB, (TBB == IBB) ? FBB : TBB, nullptr,
NewCond, dl);
}
MergePotentials.push_back(MergePotentialsElt(HashEndOfMBB(*PBB), PBB));
}
}
// If this is a large problem, avoid visiting the same basic blocks multiple
// times.
if (MergePotentials.size() == TailMergeThreshold)
for (unsigned i = 0, e = MergePotentials.size(); i != e; ++i)
TriedMerging.insert(MergePotentials[i].getBlock());
if (MergePotentials.size() >= 2)
MadeChange |= TryTailMergeBlocks(IBB, PredBB, MinCommonTailLength);
// Reinsert an unconditional branch if needed. The 1 below can occur as a
// result of removing blocks in TryTailMergeBlocks.
PredBB = &*std::prev(I); // this may have been changed in TryTailMergeBlocks
if (MergePotentials.size() == 1 &&
MergePotentials.begin()->getBlock() != PredBB)
FixTail(MergePotentials.begin()->getBlock(), IBB, TII);
}
return MadeChange;
}
void BranchFolder::setCommonTailEdgeWeights(MachineBasicBlock &TailMBB) {
SmallVector<BlockFrequency, 2> EdgeFreqLs(TailMBB.succ_size());
BlockFrequency AccumulatedMBBFreq;
// Aggregate edge frequency of successor edge j:
// edgeFreq(j) = sum (freq(bb) * edgeProb(bb, j)),
// where bb is a basic block that is in SameTails.
for (const auto &Src : SameTails) {
const MachineBasicBlock *SrcMBB = Src.getBlock();
BlockFrequency BlockFreq = MBBFreqInfo.getBlockFreq(SrcMBB);
AccumulatedMBBFreq += BlockFreq;
// It is not necessary to recompute edge weights if TailBB has less than two
// successors.
if (TailMBB.succ_size() <= 1)
continue;
auto EdgeFreq = EdgeFreqLs.begin();
for (auto SuccI = TailMBB.succ_begin(), SuccE = TailMBB.succ_end();
SuccI != SuccE; ++SuccI, ++EdgeFreq)
*EdgeFreq += BlockFreq * MBPI.getEdgeProbability(SrcMBB, *SuccI);
}
MBBFreqInfo.setBlockFreq(&TailMBB, AccumulatedMBBFreq);
if (TailMBB.succ_size() <= 1)
return;
auto SumEdgeFreq =
std::accumulate(EdgeFreqLs.begin(), EdgeFreqLs.end(), BlockFrequency(0))
.getFrequency();
auto EdgeFreq = EdgeFreqLs.begin();
if (SumEdgeFreq > 0) {
for (auto SuccI = TailMBB.succ_begin(), SuccE = TailMBB.succ_end();
SuccI != SuccE; ++SuccI, ++EdgeFreq) {
auto Prob = BranchProbability::getBranchProbability(
EdgeFreq->getFrequency(), SumEdgeFreq);
TailMBB.setSuccProbability(SuccI, Prob);
}
}
}
//===----------------------------------------------------------------------===//
// Branch Optimization
//===----------------------------------------------------------------------===//
bool BranchFolder::OptimizeBranches(MachineFunction &MF) {
bool MadeChange = false;
// Make sure blocks are numbered in order
MF.RenumberBlocks();
// Renumbering blocks alters funclet membership, recalculate it.
FuncletMembership = getFuncletMembership(MF);
for (MachineFunction::iterator I = std::next(MF.begin()), E = MF.end();
I != E; ) {
MachineBasicBlock *MBB = &*I++;
MadeChange |= OptimizeBlock(MBB);
// If it is dead, remove it.
if (MBB->pred_empty()) {
RemoveDeadBlock(MBB);
MadeChange = true;
++NumDeadBlocks;
}
}
return MadeChange;
}
// Blocks should be considered empty if they contain only debug info;
// else the debug info would affect codegen.
static bool IsEmptyBlock(MachineBasicBlock *MBB) {
return MBB->getFirstNonDebugInstr() == MBB->end();
}
// Blocks with only debug info and branches should be considered the same
// as blocks with only branches.
static bool IsBranchOnlyBlock(MachineBasicBlock *MBB) {
MachineBasicBlock::iterator I = MBB->getFirstNonDebugInstr();
assert(I != MBB->end() && "empty block!");
return I->isBranch();
}
/// IsBetterFallthrough - Return true if it would be clearly better to
/// fall-through to MBB1 than to fall through into MBB2. This has to return
/// a strict ordering, returning true for both (MBB1,MBB2) and (MBB2,MBB1) will
/// result in infinite loops.
static bool IsBetterFallthrough(MachineBasicBlock *MBB1,
MachineBasicBlock *MBB2) {
// Right now, we use a simple heuristic. If MBB2 ends with a call, and
// MBB1 doesn't, we prefer to fall through into MBB1. This allows us to
// optimize branches that branch to either a return block or an assert block
// into a fallthrough to the return.
MachineBasicBlock::iterator MBB1I = MBB1->getLastNonDebugInstr();
MachineBasicBlock::iterator MBB2I = MBB2->getLastNonDebugInstr();
if (MBB1I == MBB1->end() || MBB2I == MBB2->end())
return false;
// If there is a clear successor ordering we make sure that one block
// will fall through to the next
if (MBB1->isSuccessor(MBB2)) return true;
if (MBB2->isSuccessor(MBB1)) return false;
return MBB2I->isCall() && !MBB1I->isCall();
}
/// getBranchDebugLoc - Find and return, if any, the DebugLoc of the branch
/// instructions on the block.
static DebugLoc getBranchDebugLoc(MachineBasicBlock &MBB) {
MachineBasicBlock::iterator I = MBB.getLastNonDebugInstr();
if (I != MBB.end() && I->isBranch())
return I->getDebugLoc();
return DebugLoc();
}
/// OptimizeBlock - Analyze and optimize control flow related to the specified
/// block. This is never called on the entry block.
bool BranchFolder::OptimizeBlock(MachineBasicBlock *MBB) {
bool MadeChange = false;
MachineFunction &MF = *MBB->getParent();
ReoptimizeBlock:
MachineFunction::iterator FallThrough = MBB->getIterator();
++FallThrough;
// Make sure MBB and FallThrough belong to the same funclet.
bool SameFunclet = true;
if (!FuncletMembership.empty() && FallThrough != MF.end()) {
auto MBBFunclet = FuncletMembership.find(MBB);
assert(MBBFunclet != FuncletMembership.end());
auto FallThroughFunclet = FuncletMembership.find(&*FallThrough);
assert(FallThroughFunclet != FuncletMembership.end());
SameFunclet = MBBFunclet->second == FallThroughFunclet->second;
}
// If this block is empty, make everyone use its fall-through, not the block
// explicitly. Landing pads should not do this since the landing-pad table
// points to this block. Blocks with their addresses taken shouldn't be
// optimized away.
if (IsEmptyBlock(MBB) && !MBB->isEHPad() && !MBB->hasAddressTaken() &&
SameFunclet) {
// Dead block? Leave for cleanup later.
if (MBB->pred_empty()) return MadeChange;
if (FallThrough == MF.end()) {
// TODO: Simplify preds to not branch here if possible!
} else if (FallThrough->isEHPad()) {
// Don't rewrite to a landing pad fallthough. That could lead to the case
// where a BB jumps to more than one landing pad.
// TODO: Is it ever worth rewriting predecessors which don't already
// jump to a landing pad, and so can safely jump to the fallthrough?
} else if (MBB->isSuccessor(&*FallThrough)) {
// Rewrite all predecessors of the old block to go to the fallthrough
// instead.
while (!MBB->pred_empty()) {
MachineBasicBlock *Pred = *(MBB->pred_end()-1);
Pred->ReplaceUsesOfBlockWith(MBB, &*FallThrough);
}
// If MBB was the target of a jump table, update jump tables to go to the
// fallthrough instead.
if (MachineJumpTableInfo *MJTI = MF.getJumpTableInfo())
MJTI->ReplaceMBBInJumpTables(MBB, &*FallThrough);
MadeChange = true;
}
return MadeChange;
}
// Check to see if we can simplify the terminator of the block before this
// one.
MachineBasicBlock &PrevBB = *std::prev(MachineFunction::iterator(MBB));
MachineBasicBlock *PriorTBB = nullptr, *PriorFBB = nullptr;
SmallVector<MachineOperand, 4> PriorCond;
bool PriorUnAnalyzable =
TII->analyzeBranch(PrevBB, PriorTBB, PriorFBB, PriorCond, true);
if (!PriorUnAnalyzable) {
// If the CFG for the prior block has extra edges, remove them.
MadeChange |= PrevBB.CorrectExtraCFGEdges(PriorTBB, PriorFBB,
!PriorCond.empty());
// If the previous branch is conditional and both conditions go to the same
// destination, remove the branch, replacing it with an unconditional one or
// a fall-through.
if (PriorTBB && PriorTBB == PriorFBB) {
DebugLoc dl = getBranchDebugLoc(PrevBB);
TII->removeBranch(PrevBB);
PriorCond.clear();
if (PriorTBB != MBB)
TII->insertBranch(PrevBB, PriorTBB, nullptr, PriorCond, dl);
MadeChange = true;
++NumBranchOpts;
goto ReoptimizeBlock;
}
// If the previous block unconditionally falls through to this block and
// this block has no other predecessors, move the contents of this block
// into the prior block. This doesn't usually happen when SimplifyCFG
// has been used, but it can happen if tail merging splits a fall-through
// predecessor of a block.
// This has to check PrevBB->succ_size() because EH edges are ignored by
// AnalyzeBranch.
if (PriorCond.empty() && !PriorTBB && MBB->pred_size() == 1 &&
PrevBB.succ_size() == 1 &&
!MBB->hasAddressTaken() && !MBB->isEHPad()) {
DEBUG(dbgs() << "\nMerging into block: " << PrevBB
<< "From MBB: " << *MBB);
// Remove redundant DBG_VALUEs first.
if (PrevBB.begin() != PrevBB.end()) {
MachineBasicBlock::iterator PrevBBIter = PrevBB.end();
--PrevBBIter;
MachineBasicBlock::iterator MBBIter = MBB->begin();
// Check if DBG_VALUE at the end of PrevBB is identical to the
// DBG_VALUE at the beginning of MBB.
while (PrevBBIter != PrevBB.begin() && MBBIter != MBB->end()
&& PrevBBIter->isDebugValue() && MBBIter->isDebugValue()) {
if (!MBBIter->isIdenticalTo(*PrevBBIter))
break;
MachineInstr &DuplicateDbg = *MBBIter;
++MBBIter; -- PrevBBIter;
DuplicateDbg.eraseFromParent();
}
}
PrevBB.splice(PrevBB.end(), MBB, MBB->begin(), MBB->end());
PrevBB.removeSuccessor(PrevBB.succ_begin());
assert(PrevBB.succ_empty());
PrevBB.transferSuccessors(MBB);
MadeChange = true;
return MadeChange;
}
// If the previous branch *only* branches to *this* block (conditional or
// not) remove the branch.
if (PriorTBB == MBB && !PriorFBB) {
TII->removeBranch(PrevBB);
MadeChange = true;
++NumBranchOpts;
goto ReoptimizeBlock;
}
// If the prior block branches somewhere else on the condition and here if
// the condition is false, remove the uncond second branch.
if (PriorFBB == MBB) {
DebugLoc dl = getBranchDebugLoc(PrevBB);
TII->removeBranch(PrevBB);
TII->insertBranch(PrevBB, PriorTBB, nullptr, PriorCond, dl);
MadeChange = true;
++NumBranchOpts;
goto ReoptimizeBlock;
}
// If the prior block branches here on true and somewhere else on false, and
// if the branch condition is reversible, reverse the branch to create a
// fall-through.
if (PriorTBB == MBB) {
SmallVector<MachineOperand, 4> NewPriorCond(PriorCond);
if (!TII->reverseBranchCondition(NewPriorCond)) {
DebugLoc dl = getBranchDebugLoc(PrevBB);
TII->removeBranch(PrevBB);
TII->insertBranch(PrevBB, PriorFBB, nullptr, NewPriorCond, dl);
MadeChange = true;
++NumBranchOpts;
goto ReoptimizeBlock;
}
}
// If this block has no successors (e.g. it is a return block or ends with
// a call to a no-return function like abort or __cxa_throw) and if the pred
// falls through into this block, and if it would otherwise fall through
// into the block after this, move this block to the end of the function.
//
// We consider it more likely that execution will stay in the function (e.g.
// due to loops) than it is to exit it. This asserts in loops etc, moving
// the assert condition out of the loop body.
if (MBB->succ_empty() && !PriorCond.empty() && !PriorFBB &&
MachineFunction::iterator(PriorTBB) == FallThrough &&
!MBB->canFallThrough()) {
bool DoTransform = true;
// We have to be careful that the succs of PredBB aren't both no-successor
// blocks. If neither have successors and if PredBB is the second from
// last block in the function, we'd just keep swapping the two blocks for
// last. Only do the swap if one is clearly better to fall through than
// the other.
if (FallThrough == --MF.end() &&
!IsBetterFallthrough(PriorTBB, MBB))
DoTransform = false;
if (DoTransform) {
// Reverse the branch so we will fall through on the previous true cond.
SmallVector<MachineOperand, 4> NewPriorCond(PriorCond);
if (!TII->reverseBranchCondition(NewPriorCond)) {
DEBUG(dbgs() << "\nMoving MBB: " << *MBB
<< "To make fallthrough to: " << *PriorTBB << "\n");
DebugLoc dl = getBranchDebugLoc(PrevBB);
TII->removeBranch(PrevBB);
TII->insertBranch(PrevBB, MBB, nullptr, NewPriorCond, dl);
// Move this block to the end of the function.
MBB->moveAfter(&MF.back());
MadeChange = true;
++NumBranchOpts;
return MadeChange;
}
}
- }
- }
-
- if (!IsEmptyBlock(MBB) && MBB->pred_size() == 1 &&
- MF.getFunction()->optForSize()) {
- // Changing "Jcc foo; foo: jmp bar;" into "Jcc bar;" might change the branch
- // direction, thereby defeating careful block placement and regressing
- // performance. Therefore, only consider this for optsize functions.
- MachineInstr &TailCall = *MBB->getFirstNonDebugInstr();
- if (TII->isUnconditionalTailCall(TailCall)) {
- MachineBasicBlock *Pred = *MBB->pred_begin();
- MachineBasicBlock *PredTBB = nullptr, *PredFBB = nullptr;
- SmallVector<MachineOperand, 4> PredCond;
- bool PredAnalyzable =
- !TII->analyzeBranch(*Pred, PredTBB, PredFBB, PredCond, true);
-
- if (PredAnalyzable && !PredCond.empty() && PredTBB == MBB) {
- // The predecessor has a conditional branch to this block which consists
- // of only a tail call. Try to fold the tail call into the conditional
- // branch.
- if (TII->canMakeTailCallConditional(PredCond, TailCall)) {
- // TODO: It would be nice if analyzeBranch() could provide a pointer
- // to the branch insturction so replaceBranchWithTailCall() doesn't
- // have to search for it.
- TII->replaceBranchWithTailCall(*Pred, PredCond, TailCall);
- ++NumTailCalls;
- Pred->removeSuccessor(MBB);
- MadeChange = true;
- return MadeChange;
- }
- }
- // If the predecessor is falling through to this block, we could reverse
- // the branch condition and fold the tail call into that. However, after
- // that we might have to re-arrange the CFG to fall through to the other
- // block and there is a high risk of regressing code size rather than
- // improving it.
}
}
// Analyze the branch in the current block.
MachineBasicBlock *CurTBB = nullptr, *CurFBB = nullptr;
SmallVector<MachineOperand, 4> CurCond;
bool CurUnAnalyzable =
TII->analyzeBranch(*MBB, CurTBB, CurFBB, CurCond, true);
if (!CurUnAnalyzable) {
// If the CFG for the prior block has extra edges, remove them.
MadeChange |= MBB->CorrectExtraCFGEdges(CurTBB, CurFBB, !CurCond.empty());
// If this is a two-way branch, and the FBB branches to this block, reverse
// the condition so the single-basic-block loop is faster. Instead of:
// Loop: xxx; jcc Out; jmp Loop
// we want:
// Loop: xxx; jncc Loop; jmp Out
if (CurTBB && CurFBB && CurFBB == MBB && CurTBB != MBB) {
SmallVector<MachineOperand, 4> NewCond(CurCond);
if (!TII->reverseBranchCondition(NewCond)) {
DebugLoc dl = getBranchDebugLoc(*MBB);
TII->removeBranch(*MBB);
TII->insertBranch(*MBB, CurFBB, CurTBB, NewCond, dl);
MadeChange = true;
++NumBranchOpts;
goto ReoptimizeBlock;
}
}
// If this branch is the only thing in its block, see if we can forward
// other blocks across it.
if (CurTBB && CurCond.empty() && !CurFBB &&
IsBranchOnlyBlock(MBB) && CurTBB != MBB &&
!MBB->hasAddressTaken() && !MBB->isEHPad()) {
DebugLoc dl = getBranchDebugLoc(*MBB);
// This block may contain just an unconditional branch. Because there can
// be 'non-branch terminators' in the block, try removing the branch and
// then seeing if the block is empty.
TII->removeBranch(*MBB);
// If the only things remaining in the block are debug info, remove these
// as well, so this will behave the same as an empty block in non-debug
// mode.
if (IsEmptyBlock(MBB)) {
// Make the block empty, losing the debug info (we could probably
// improve this in some cases.)
MBB->erase(MBB->begin(), MBB->end());
}
// If this block is just an unconditional branch to CurTBB, we can
// usually completely eliminate the block. The only case we cannot
// completely eliminate the block is when the block before this one
// falls through into MBB and we can't understand the prior block's branch
// condition.
if (MBB->empty()) {
bool PredHasNoFallThrough = !PrevBB.canFallThrough();
if (PredHasNoFallThrough || !PriorUnAnalyzable ||
!PrevBB.isSuccessor(MBB)) {
// If the prior block falls through into us, turn it into an
// explicit branch to us to make updates simpler.
if (!PredHasNoFallThrough && PrevBB.isSuccessor(MBB) &&
PriorTBB != MBB && PriorFBB != MBB) {
if (!PriorTBB) {
assert(PriorCond.empty() && !PriorFBB &&
"Bad branch analysis");
PriorTBB = MBB;
} else {
assert(!PriorFBB && "Machine CFG out of date!");
PriorFBB = MBB;
}
DebugLoc pdl = getBranchDebugLoc(PrevBB);
TII->removeBranch(PrevBB);
TII->insertBranch(PrevBB, PriorTBB, PriorFBB, PriorCond, pdl);
}
// Iterate through all the predecessors, revectoring each in-turn.
size_t PI = 0;
bool DidChange = false;
bool HasBranchToSelf = false;
while(PI != MBB->pred_size()) {
MachineBasicBlock *PMBB = *(MBB->pred_begin() + PI);
if (PMBB == MBB) {
// If this block has an uncond branch to itself, leave it.
++PI;
HasBranchToSelf = true;
} else {
DidChange = true;
PMBB->ReplaceUsesOfBlockWith(MBB, CurTBB);
// If this change resulted in PMBB ending in a conditional
// branch where both conditions go to the same destination,
// change this to an unconditional branch (and fix the CFG).
MachineBasicBlock *NewCurTBB = nullptr, *NewCurFBB = nullptr;
SmallVector<MachineOperand, 4> NewCurCond;
bool NewCurUnAnalyzable = TII->analyzeBranch(
*PMBB, NewCurTBB, NewCurFBB, NewCurCond, true);
if (!NewCurUnAnalyzable && NewCurTBB && NewCurTBB == NewCurFBB) {
DebugLoc pdl = getBranchDebugLoc(*PMBB);
TII->removeBranch(*PMBB);
NewCurCond.clear();
TII->insertBranch(*PMBB, NewCurTBB, nullptr, NewCurCond, pdl);
MadeChange = true;
++NumBranchOpts;
PMBB->CorrectExtraCFGEdges(NewCurTBB, nullptr, false);
}
}
}
// Change any jumptables to go to the new MBB.
if (MachineJumpTableInfo *MJTI = MF.getJumpTableInfo())
MJTI->ReplaceMBBInJumpTables(MBB, CurTBB);
if (DidChange) {
++NumBranchOpts;
MadeChange = true;
if (!HasBranchToSelf) return MadeChange;
}
}
}
// Add the branch back if the block is more than just an uncond branch.
TII->insertBranch(*MBB, CurTBB, nullptr, CurCond, dl);
}
}
// If the prior block doesn't fall through into this block, and if this
// block doesn't fall through into some other block, see if we can find a
// place to move this block where a fall-through will happen.
if (!PrevBB.canFallThrough()) {
// Now we know that there was no fall-through into this block, check to
// see if it has a fall-through into its successor.
bool CurFallsThru = MBB->canFallThrough();
if (!MBB->isEHPad()) {
// Check all the predecessors of this block. If one of them has no fall
// throughs, move this block right after it.
for (MachineBasicBlock *PredBB : MBB->predecessors()) {
// Analyze the branch at the end of the pred.
MachineBasicBlock *PredTBB = nullptr, *PredFBB = nullptr;
SmallVector<MachineOperand, 4> PredCond;
if (PredBB != MBB && !PredBB->canFallThrough() &&
!TII->analyzeBranch(*PredBB, PredTBB, PredFBB, PredCond, true) &&
(!CurFallsThru || !CurTBB || !CurFBB) &&
(!CurFallsThru || MBB->getNumber() >= PredBB->getNumber())) {
// If the current block doesn't fall through, just move it.
// If the current block can fall through and does not end with a
// conditional branch, we need to append an unconditional jump to
// the (current) next block. To avoid a possible compile-time
// infinite loop, move blocks only backward in this case.
// Also, if there are already 2 branches here, we cannot add a third;
// this means we have the case
// Bcc next
// B elsewhere
// next:
if (CurFallsThru) {
MachineBasicBlock *NextBB = &*std::next(MBB->getIterator());
CurCond.clear();
TII->insertBranch(*MBB, NextBB, nullptr, CurCond, DebugLoc());
}
MBB->moveAfter(PredBB);
MadeChange = true;
goto ReoptimizeBlock;
}
}
}
if (!CurFallsThru) {
// Check all successors to see if we can move this block before it.
for (MachineBasicBlock *SuccBB : MBB->successors()) {
// Analyze the branch at the end of the block before the succ.
MachineFunction::iterator SuccPrev = --SuccBB->getIterator();
// If this block doesn't already fall-through to that successor, and if
// the succ doesn't already have a block that can fall through into it,
// and if the successor isn't an EH destination, we can arrange for the
// fallthrough to happen.
if (SuccBB != MBB && &*SuccPrev != MBB &&
!SuccPrev->canFallThrough() && !CurUnAnalyzable &&
!SuccBB->isEHPad()) {
MBB->moveBefore(SuccBB);
MadeChange = true;
goto ReoptimizeBlock;
}
}
// Okay, there is no really great place to put this block. If, however,
// the block before this one would be a fall-through if this block were
// removed, move this block to the end of the function. There is no real
// advantage in "falling through" to an EH block, so we don't want to
// perform this transformation for that case.
//
// Also, Windows EH introduced the possibility of an arbitrary number of
// successors to a given block. The analyzeBranch call does not consider
// exception handling and so we can get in a state where a block
// containing a call is followed by multiple EH blocks that would be
// rotated infinitely at the end of the function if the transformation
// below were performed for EH "FallThrough" blocks. Therefore, even if
// that appears not to be happening anymore, we should assume that it is
// possible and not remove the "!FallThrough()->isEHPad" condition below.
MachineBasicBlock *PrevTBB = nullptr, *PrevFBB = nullptr;
SmallVector<MachineOperand, 4> PrevCond;
if (FallThrough != MF.end() &&
!FallThrough->isEHPad() &&
!TII->analyzeBranch(PrevBB, PrevTBB, PrevFBB, PrevCond, true) &&
PrevBB.isSuccessor(&*FallThrough)) {
MBB->moveAfter(&MF.back());
MadeChange = true;
return MadeChange;
}
}
}
return MadeChange;
}
//===----------------------------------------------------------------------===//
// Hoist Common Code
//===----------------------------------------------------------------------===//
/// HoistCommonCode - Hoist common instruction sequences at the start of basic
/// blocks to their common predecessor.
bool BranchFolder::HoistCommonCode(MachineFunction &MF) {
bool MadeChange = false;
for (MachineFunction::iterator I = MF.begin(), E = MF.end(); I != E; ) {
MachineBasicBlock *MBB = &*I++;
MadeChange |= HoistCommonCodeInSuccs(MBB);
}
return MadeChange;
}
/// findFalseBlock - BB has a fallthrough. Find its 'false' successor given
/// its 'true' successor.
static MachineBasicBlock *findFalseBlock(MachineBasicBlock *BB,
MachineBasicBlock *TrueBB) {
for (MachineBasicBlock *SuccBB : BB->successors())
if (SuccBB != TrueBB)
return SuccBB;
return nullptr;
}
template <class Container>
static void addRegAndItsAliases(unsigned Reg, const TargetRegisterInfo *TRI,
Container &Set) {
if (TargetRegisterInfo::isPhysicalRegister(Reg)) {
for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI)
Set.insert(*AI);
} else {
Set.insert(Reg);
}
}
/// findHoistingInsertPosAndDeps - Find the location to move common instructions
/// in successors to. The location is usually just before the terminator,
/// however if the terminator is a conditional branch and its previous
/// instruction is the flag setting instruction, the previous instruction is
/// the preferred location. This function also gathers uses and defs of the
/// instructions from the insertion point to the end of the block. The data is
/// used by HoistCommonCodeInSuccs to ensure safety.
static
MachineBasicBlock::iterator findHoistingInsertPosAndDeps(MachineBasicBlock *MBB,
const TargetInstrInfo *TII,
const TargetRegisterInfo *TRI,
SmallSet<unsigned,4> &Uses,
SmallSet<unsigned,4> &Defs) {
MachineBasicBlock::iterator Loc = MBB->getFirstTerminator();
if (!TII->isUnpredicatedTerminator(*Loc))
return MBB->end();
for (const MachineOperand &MO : Loc->operands()) {
if (!MO.isReg())
continue;
unsigned Reg = MO.getReg();
if (!Reg)
continue;
if (MO.isUse()) {
addRegAndItsAliases(Reg, TRI, Uses);
} else {
if (!MO.isDead())
// Don't try to hoist code in the rare case the terminator defines a
// register that is later used.
return MBB->end();
// If the terminator defines a register, make sure we don't hoist
// the instruction whose def might be clobbered by the terminator.
addRegAndItsAliases(Reg, TRI, Defs);
}
}
if (Uses.empty())
return Loc;
if (Loc == MBB->begin())
return MBB->end();
// The terminator is probably a conditional branch, try not to separate the
// branch from condition setting instruction.
MachineBasicBlock::iterator PI =
skipDebugInstructionsBackward(std::prev(Loc), MBB->begin());
bool IsDef = false;
for (const MachineOperand &MO : PI->operands()) {
// If PI has a regmask operand, it is probably a call. Separate away.
if (MO.isRegMask())
return Loc;
if (!MO.isReg() || MO.isUse())
continue;
unsigned Reg = MO.getReg();
if (!Reg)
continue;
if (Uses.count(Reg)) {
IsDef = true;
break;
}
}
if (!IsDef)
// The condition setting instruction is not just before the conditional
// branch.
return Loc;
// Be conservative, don't insert instruction above something that may have
// side-effects. And since it's potentially bad to separate flag setting
// instruction from the conditional branch, just abort the optimization
// completely.
// Also avoid moving code above predicated instruction since it's hard to
// reason about register liveness with predicated instruction.
bool DontMoveAcrossStore = true;
if (!PI->isSafeToMove(nullptr, DontMoveAcrossStore) || TII->isPredicated(*PI))
return MBB->end();
// Find out what registers are live. Note this routine is ignoring other live
// registers which are only used by instructions in successor blocks.
for (const MachineOperand &MO : PI->operands()) {
if (!MO.isReg())
continue;
unsigned Reg = MO.getReg();
if (!Reg)
continue;
if (MO.isUse()) {
addRegAndItsAliases(Reg, TRI, Uses);
} else {
if (Uses.erase(Reg)) {
if (TargetRegisterInfo::isPhysicalRegister(Reg)) {
for (MCSubRegIterator SubRegs(Reg, TRI); SubRegs.isValid(); ++SubRegs)
Uses.erase(*SubRegs); // Use sub-registers to be conservative
}
}
addRegAndItsAliases(Reg, TRI, Defs);
}
}
return PI;
}
/// HoistCommonCodeInSuccs - If the successors of MBB has common instruction
/// sequence at the start of the function, move the instructions before MBB
/// terminator if it's legal.
bool BranchFolder::HoistCommonCodeInSuccs(MachineBasicBlock *MBB) {
MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
SmallVector<MachineOperand, 4> Cond;
if (TII->analyzeBranch(*MBB, TBB, FBB, Cond, true) || !TBB || Cond.empty())
return false;
if (!FBB) FBB = findFalseBlock(MBB, TBB);
if (!FBB)
// Malformed bcc? True and false blocks are the same?
return false;
// Restrict the optimization to cases where MBB is the only predecessor,
// it is an obvious win.
if (TBB->pred_size() > 1 || FBB->pred_size() > 1)
return false;
// Find a suitable position to hoist the common instructions to. Also figure
// out which registers are used or defined by instructions from the insertion
// point to the end of the block.
SmallSet<unsigned, 4> Uses, Defs;
MachineBasicBlock::iterator Loc =
findHoistingInsertPosAndDeps(MBB, TII, TRI, Uses, Defs);
if (Loc == MBB->end())
return false;
bool HasDups = false;
SmallVector<unsigned, 4> LocalDefs;
SmallSet<unsigned, 4> LocalDefsSet;
MachineBasicBlock::iterator TIB = TBB->begin();
MachineBasicBlock::iterator FIB = FBB->begin();
MachineBasicBlock::iterator TIE = TBB->end();
MachineBasicBlock::iterator FIE = FBB->end();
while (TIB != TIE && FIB != FIE) {
// Skip dbg_value instructions. These do not count.
TIB = skipDebugInstructionsForward(TIB, TIE);
FIB = skipDebugInstructionsForward(FIB, FIE);
if (TIB == TIE || FIB == FIE)
break;
if (!TIB->isIdenticalTo(*FIB, MachineInstr::CheckKillDead))
break;
if (TII->isPredicated(*TIB))
// Hard to reason about register liveness with predicated instruction.
break;
bool IsSafe = true;
for (MachineOperand &MO : TIB->operands()) {
// Don't attempt to hoist instructions with register masks.
if (MO.isRegMask()) {
IsSafe = false;
break;
}
if (!MO.isReg())
continue;
unsigned Reg = MO.getReg();
if (!Reg)
continue;
if (MO.isDef()) {
if (Uses.count(Reg)) {
// Avoid clobbering a register that's used by the instruction at
// the point of insertion.
IsSafe = false;
break;
}
if (Defs.count(Reg) && !MO.isDead()) {
// Don't hoist the instruction if the def would be clobber by the
// instruction at the point insertion. FIXME: This is overly
// conservative. It should be possible to hoist the instructions
// in BB2 in the following example:
// BB1:
// r1, eflag = op1 r2, r3
// brcc eflag
//
// BB2:
// r1 = op2, ...
// = op3, r1<kill>
IsSafe = false;
break;
}
} else if (!LocalDefsSet.count(Reg)) {
if (Defs.count(Reg)) {
// Use is defined by the instruction at the point of insertion.
IsSafe = false;
break;
}
if (MO.isKill() && Uses.count(Reg))
// Kills a register that's read by the instruction at the point of
// insertion. Remove the kill marker.
MO.setIsKill(false);
}
}
if (!IsSafe)
break;
bool DontMoveAcrossStore = true;
if (!TIB->isSafeToMove(nullptr, DontMoveAcrossStore))
break;
// Remove kills from LocalDefsSet, these registers had short live ranges.
for (const MachineOperand &MO : TIB->operands()) {
if (!MO.isReg() || !MO.isUse() || !MO.isKill())
continue;
unsigned Reg = MO.getReg();
if (!Reg || !LocalDefsSet.count(Reg))
continue;
if (TargetRegisterInfo::isPhysicalRegister(Reg)) {
for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI)
LocalDefsSet.erase(*AI);
} else {
LocalDefsSet.erase(Reg);
}
}
// Track local defs so we can update liveins.
for (const MachineOperand &MO : TIB->operands()) {
if (!MO.isReg() || !MO.isDef() || MO.isDead())
continue;
unsigned Reg = MO.getReg();
if (!Reg || TargetRegisterInfo::isVirtualRegister(Reg))
continue;
LocalDefs.push_back(Reg);
addRegAndItsAliases(Reg, TRI, LocalDefsSet);
}
HasDups = true;
++TIB;
++FIB;
}
if (!HasDups)
return false;
MBB->splice(Loc, TBB, TBB->begin(), TIB);
FBB->erase(FBB->begin(), FIB);
// Update livein's.
bool AddedLiveIns = false;
for (unsigned i = 0, e = LocalDefs.size(); i != e; ++i) {
unsigned Def = LocalDefs[i];
if (LocalDefsSet.count(Def)) {
TBB->addLiveIn(Def);
FBB->addLiveIn(Def);
AddedLiveIns = true;
}
}
if (AddedLiveIns) {
TBB->sortUniqueLiveIns();
FBB->sortUniqueLiveIns();
}
++NumHoist;
return true;
}
Index: projects/clang400-import/contrib/llvm/lib/CodeGen/MachineCopyPropagation.cpp
===================================================================
--- projects/clang400-import/contrib/llvm/lib/CodeGen/MachineCopyPropagation.cpp (revision 313642)
+++ projects/clang400-import/contrib/llvm/lib/CodeGen/MachineCopyPropagation.cpp (revision 313643)
@@ -1,371 +1,385 @@
//===- MachineCopyPropagation.cpp - Machine Copy Propagation Pass ---------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This is an extremely simple MachineInstr-level copy propagation pass.
//
//===----------------------------------------------------------------------===//
#include "llvm/CodeGen/Passes.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/Pass.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetRegisterInfo.h"
#include "llvm/Target/TargetSubtargetInfo.h"
using namespace llvm;
#define DEBUG_TYPE "codegen-cp"
STATISTIC(NumDeletes, "Number of dead copies deleted");
namespace {
typedef SmallVector<unsigned, 4> RegList;
typedef DenseMap<unsigned, RegList> SourceMap;
typedef DenseMap<unsigned, MachineInstr*> Reg2MIMap;
class MachineCopyPropagation : public MachineFunctionPass {
const TargetRegisterInfo *TRI;
const TargetInstrInfo *TII;
const MachineRegisterInfo *MRI;
public:
static char ID; // Pass identification, replacement for typeid
MachineCopyPropagation() : MachineFunctionPass(ID) {
initializeMachineCopyPropagationPass(*PassRegistry::getPassRegistry());
}
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.setPreservesCFG();
MachineFunctionPass::getAnalysisUsage(AU);
}
bool runOnMachineFunction(MachineFunction &MF) override;
MachineFunctionProperties getRequiredProperties() const override {
return MachineFunctionProperties().set(
MachineFunctionProperties::Property::NoVRegs);
}
private:
void ClobberRegister(unsigned Reg);
+ void ReadRegister(unsigned Reg);
void CopyPropagateBlock(MachineBasicBlock &MBB);
bool eraseIfRedundant(MachineInstr &Copy, unsigned Src, unsigned Def);
/// Candidates for deletion.
SmallSetVector<MachineInstr*, 8> MaybeDeadCopies;
/// Def -> available copies map.
Reg2MIMap AvailCopyMap;
/// Def -> copies map.
Reg2MIMap CopyMap;
/// Src -> Def map
SourceMap SrcMap;
bool Changed;
};
}
char MachineCopyPropagation::ID = 0;
char &llvm::MachineCopyPropagationID = MachineCopyPropagation::ID;
INITIALIZE_PASS(MachineCopyPropagation, "machine-cp",
"Machine Copy Propagation Pass", false, false)
/// Remove any entry in \p Map where the register is a subregister or equal to
/// a register contained in \p Regs.
static void removeRegsFromMap(Reg2MIMap &Map, const RegList &Regs,
const TargetRegisterInfo &TRI) {
for (unsigned Reg : Regs) {
// Source of copy is no longer available for propagation.
for (MCSubRegIterator SR(Reg, &TRI, true); SR.isValid(); ++SR)
Map.erase(*SR);
}
}
/// Remove any entry in \p Map that is marked clobbered in \p RegMask.
/// The map will typically have a lot fewer entries than the regmask clobbers,
/// so this is more efficient than iterating the clobbered registers and calling
/// ClobberRegister() on them.
static void removeClobberedRegsFromMap(Reg2MIMap &Map,
const MachineOperand &RegMask) {
for (Reg2MIMap::iterator I = Map.begin(), E = Map.end(), Next; I != E;
I = Next) {
Next = std::next(I);
unsigned Reg = I->first;
if (RegMask.clobbersPhysReg(Reg))
Map.erase(I);
}
}
void MachineCopyPropagation::ClobberRegister(unsigned Reg) {
for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI) {
CopyMap.erase(*AI);
AvailCopyMap.erase(*AI);
SourceMap::iterator SI = SrcMap.find(*AI);
if (SI != SrcMap.end()) {
removeRegsFromMap(AvailCopyMap, SI->second, *TRI);
SrcMap.erase(SI);
}
}
}
+void MachineCopyPropagation::ReadRegister(unsigned Reg) {
+ // If 'Reg' is defined by a copy, the copy is no longer a candidate
+ // for elimination.
+ for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI) {
+ Reg2MIMap::iterator CI = CopyMap.find(*AI);
+ if (CI != CopyMap.end()) {
+ DEBUG(dbgs() << "MCP: Copy is used - not dead: "; CI->second->dump());
+ MaybeDeadCopies.remove(CI->second);
+ }
+ }
+}
+
/// Return true if \p PreviousCopy did copy register \p Src to register \p Def.
/// This fact may have been obscured by sub register usage or may not be true at
/// all even though Src and Def are subregisters of the registers used in
/// PreviousCopy. e.g.
/// isNopCopy("ecx = COPY eax", AX, CX) == true
/// isNopCopy("ecx = COPY eax", AH, CL) == false
static bool isNopCopy(const MachineInstr &PreviousCopy, unsigned Src,
unsigned Def, const TargetRegisterInfo *TRI) {
unsigned PreviousSrc = PreviousCopy.getOperand(1).getReg();
unsigned PreviousDef = PreviousCopy.getOperand(0).getReg();
if (Src == PreviousSrc) {
assert(Def == PreviousDef);
return true;
}
if (!TRI->isSubRegister(PreviousSrc, Src))
return false;
unsigned SubIdx = TRI->getSubRegIndex(PreviousSrc, Src);
return SubIdx == TRI->getSubRegIndex(PreviousDef, Def);
}
/// Remove instruction \p Copy if there exists a previous copy that copies the
/// register \p Src to the register \p Def; This may happen indirectly by
/// copying the super registers.
bool MachineCopyPropagation::eraseIfRedundant(MachineInstr &Copy, unsigned Src,
unsigned Def) {
// Avoid eliminating a copy from/to a reserved registers as we cannot predict
// the value (Example: The sparc zero register is writable but stays zero).
if (MRI->isReserved(Src) || MRI->isReserved(Def))
return false;
// Search for an existing copy.
Reg2MIMap::iterator CI = AvailCopyMap.find(Def);
if (CI == AvailCopyMap.end())
return false;
// Check that the existing copy uses the correct sub registers.
MachineInstr &PrevCopy = *CI->second;
if (!isNopCopy(PrevCopy, Src, Def, TRI))
return false;
DEBUG(dbgs() << "MCP: copy is a NOP, removing: "; Copy.dump());
// Copy was redundantly redefining either Src or Def. Remove earlier kill
// flags between Copy and PrevCopy because the value will be reused now.
assert(Copy.isCopy());
unsigned CopyDef = Copy.getOperand(0).getReg();
assert(CopyDef == Src || CopyDef == Def);
for (MachineInstr &MI :
make_range(PrevCopy.getIterator(), Copy.getIterator()))
MI.clearRegisterKills(CopyDef, TRI);
Copy.eraseFromParent();
Changed = true;
++NumDeletes;
return true;
}
void MachineCopyPropagation::CopyPropagateBlock(MachineBasicBlock &MBB) {
DEBUG(dbgs() << "MCP: CopyPropagateBlock " << MBB.getName() << "\n");
for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E; ) {
MachineInstr *MI = &*I;
++I;
if (MI->isCopy()) {
unsigned Def = MI->getOperand(0).getReg();
unsigned Src = MI->getOperand(1).getReg();
assert(!TargetRegisterInfo::isVirtualRegister(Def) &&
!TargetRegisterInfo::isVirtualRegister(Src) &&
"MachineCopyPropagation should be run after register allocation!");
// The two copies cancel out and the source of the first copy
// hasn't been overridden, eliminate the second one. e.g.
// %ECX<def> = COPY %EAX
// ... nothing clobbered EAX.
// %EAX<def> = COPY %ECX
// =>
// %ECX<def> = COPY %EAX
//
// or
//
// %ECX<def> = COPY %EAX
// ... nothing clobbered EAX.
// %ECX<def> = COPY %EAX
// =>
// %ECX<def> = COPY %EAX
if (eraseIfRedundant(*MI, Def, Src) || eraseIfRedundant(*MI, Src, Def))
continue;
// If Src is defined by a previous copy, the previous copy cannot be
// eliminated.
- for (MCRegAliasIterator AI(Src, TRI, true); AI.isValid(); ++AI) {
- Reg2MIMap::iterator CI = CopyMap.find(*AI);
- if (CI != CopyMap.end()) {
- DEBUG(dbgs() << "MCP: Copy is no longer dead: "; CI->second->dump());
- MaybeDeadCopies.remove(CI->second);
- }
+ ReadRegister(Src);
+ for (const MachineOperand &MO : MI->implicit_operands()) {
+ if (!MO.isReg() || !MO.readsReg())
+ continue;
+ unsigned Reg = MO.getReg();
+ if (!Reg)
+ continue;
+ ReadRegister(Reg);
}
DEBUG(dbgs() << "MCP: Copy is a deletion candidate: "; MI->dump());
// Copy is now a candidate for deletion.
if (!MRI->isReserved(Def))
MaybeDeadCopies.insert(MI);
// If 'Def' is previously source of another copy, then this earlier copy's
// source is no longer available. e.g.
// %xmm9<def> = copy %xmm2
// ...
// %xmm2<def> = copy %xmm0
// ...
// %xmm2<def> = copy %xmm9
ClobberRegister(Def);
+ for (const MachineOperand &MO : MI->implicit_operands()) {
+ if (!MO.isReg() || !MO.isDef())
+ continue;
+ unsigned Reg = MO.getReg();
+ if (!Reg)
+ continue;
+ ClobberRegister(Reg);
+ }
// Remember Def is defined by the copy.
for (MCSubRegIterator SR(Def, TRI, /*IncludeSelf=*/true); SR.isValid();
++SR) {
CopyMap[*SR] = MI;
AvailCopyMap[*SR] = MI;
}
// Remember source that's copied to Def. Once it's clobbered, then
// it's no longer available for copy propagation.
RegList &DestList = SrcMap[Src];
if (!is_contained(DestList, Def))
DestList.push_back(Def);
continue;
}
// Not a copy.
SmallVector<unsigned, 2> Defs;
const MachineOperand *RegMask = nullptr;
for (const MachineOperand &MO : MI->operands()) {
if (MO.isRegMask())
RegMask = &MO;
if (!MO.isReg())
continue;
unsigned Reg = MO.getReg();
if (!Reg)
continue;
assert(!TargetRegisterInfo::isVirtualRegister(Reg) &&
"MachineCopyPropagation should be run after register allocation!");
if (MO.isDef()) {
Defs.push_back(Reg);
- continue;
- }
-
- // If 'Reg' is defined by a copy, the copy is no longer a candidate
- // for elimination.
- for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI) {
- Reg2MIMap::iterator CI = CopyMap.find(*AI);
- if (CI != CopyMap.end()) {
- DEBUG(dbgs() << "MCP: Copy is used - not dead: "; CI->second->dump());
- MaybeDeadCopies.remove(CI->second);
- }
+ } else {
+ ReadRegister(Reg);
}
// Treat undef use like defs for copy propagation but not for
// dead copy. We would need to do a liveness check to be sure the copy
// is dead for undef uses.
// The backends are allowed to do whatever they want with undef value
// and we cannot be sure this register will not be rewritten to break
// some false dependencies for the hardware for instance.
if (MO.isUndef())
Defs.push_back(Reg);
}
// The instruction has a register mask operand which means that it clobbers
// a large set of registers. Treat clobbered registers the same way as
// defined registers.
if (RegMask) {
// Erase any MaybeDeadCopies whose destination register is clobbered.
for (SmallSetVector<MachineInstr *, 8>::iterator DI =
MaybeDeadCopies.begin();
DI != MaybeDeadCopies.end();) {
MachineInstr *MaybeDead = *DI;
unsigned Reg = MaybeDead->getOperand(0).getReg();
assert(!MRI->isReserved(Reg));
if (!RegMask->clobbersPhysReg(Reg)) {
++DI;
continue;
}
DEBUG(dbgs() << "MCP: Removing copy due to regmask clobbering: ";
MaybeDead->dump());
// erase() will return the next valid iterator pointing to the next
// element after the erased one.
DI = MaybeDeadCopies.erase(DI);
MaybeDead->eraseFromParent();
Changed = true;
++NumDeletes;
}
removeClobberedRegsFromMap(AvailCopyMap, *RegMask);
removeClobberedRegsFromMap(CopyMap, *RegMask);
for (SourceMap::iterator I = SrcMap.begin(), E = SrcMap.end(), Next;
I != E; I = Next) {
Next = std::next(I);
if (RegMask->clobbersPhysReg(I->first)) {
removeRegsFromMap(AvailCopyMap, I->second, *TRI);
SrcMap.erase(I);
}
}
}
// Any previous copy definition or reading the Defs is no longer available.
for (unsigned Reg : Defs)
ClobberRegister(Reg);
}
// If MBB doesn't have successors, delete the copies whose defs are not used.
// If MBB does have successors, then conservative assume the defs are live-out
// since we don't want to trust live-in lists.
if (MBB.succ_empty()) {
for (MachineInstr *MaybeDead : MaybeDeadCopies) {
assert(!MRI->isReserved(MaybeDead->getOperand(0).getReg()));
MaybeDead->eraseFromParent();
Changed = true;
++NumDeletes;
}
}
MaybeDeadCopies.clear();
AvailCopyMap.clear();
CopyMap.clear();
SrcMap.clear();
}
bool MachineCopyPropagation::runOnMachineFunction(MachineFunction &MF) {
if (skipFunction(*MF.getFunction()))
return false;
Changed = false;
TRI = MF.getSubtarget().getRegisterInfo();
TII = MF.getSubtarget().getInstrInfo();
MRI = &MF.getRegInfo();
for (MachineBasicBlock &MBB : MF)
CopyPropagateBlock(MBB);
return Changed;
}
Index: projects/clang400-import/contrib/llvm/lib/CodeGen/RegisterCoalescer.cpp
===================================================================
--- projects/clang400-import/contrib/llvm/lib/CodeGen/RegisterCoalescer.cpp (revision 313642)
+++ projects/clang400-import/contrib/llvm/lib/CodeGen/RegisterCoalescer.cpp (revision 313643)
@@ -1,3186 +1,3206 @@
//===- RegisterCoalescer.cpp - Generic Register Coalescing Interface -------==//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file implements the generic RegisterCoalescer interface which
// is used as the common interface used by all clients and
// implementations of register coalescing.
//
//===----------------------------------------------------------------------===//
#include "RegisterCoalescer.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/CodeGen/LiveIntervalAnalysis.h"
#include "llvm/CodeGen/LiveRangeEdit.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/RegisterClassInfo.h"
#include "llvm/CodeGen/VirtRegMap.h"
#include "llvm/IR/Value.h"
#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetRegisterInfo.h"
#include "llvm/Target/TargetSubtargetInfo.h"
#include <algorithm>
#include <cmath>
using namespace llvm;
#define DEBUG_TYPE "regalloc"
STATISTIC(numJoins , "Number of interval joins performed");
STATISTIC(numCrossRCs , "Number of cross class joins performed");
STATISTIC(numCommutes , "Number of instruction commuting performed");
STATISTIC(numExtends , "Number of copies extended");
STATISTIC(NumReMats , "Number of instructions re-materialized");
STATISTIC(NumInflated , "Number of register classes inflated");
STATISTIC(NumLaneConflicts, "Number of dead lane conflicts tested");
STATISTIC(NumLaneResolves, "Number of dead lane conflicts resolved");
static cl::opt<bool>
EnableJoining("join-liveintervals",
cl::desc("Coalesce copies (default=true)"),
cl::init(true));
static cl::opt<bool> UseTerminalRule("terminal-rule",
cl::desc("Apply the terminal rule"),
cl::init(false), cl::Hidden);
/// Temporary flag to test critical edge unsplitting.
static cl::opt<bool>
EnableJoinSplits("join-splitedges",
cl::desc("Coalesce copies on split edges (default=subtarget)"), cl::Hidden);
/// Temporary flag to test global copy optimization.
static cl::opt<cl::boolOrDefault>
EnableGlobalCopies("join-globalcopies",
cl::desc("Coalesce copies that span blocks (default=subtarget)"),
cl::init(cl::BOU_UNSET), cl::Hidden);
static cl::opt<bool>
VerifyCoalescing("verify-coalescing",
cl::desc("Verify machine instrs before and after register coalescing"),
cl::Hidden);
namespace {
class RegisterCoalescer : public MachineFunctionPass,
private LiveRangeEdit::Delegate {
MachineFunction* MF;
MachineRegisterInfo* MRI;
const TargetMachine* TM;
const TargetRegisterInfo* TRI;
const TargetInstrInfo* TII;
LiveIntervals *LIS;
const MachineLoopInfo* Loops;
AliasAnalysis *AA;
RegisterClassInfo RegClassInfo;
/// A LaneMask to remember on which subregister live ranges we need to call
/// shrinkToUses() later.
LaneBitmask ShrinkMask;
/// True if the main range of the currently coalesced intervals should be
/// checked for smaller live intervals.
bool ShrinkMainRange;
/// \brief True if the coalescer should aggressively coalesce global copies
/// in favor of keeping local copies.
bool JoinGlobalCopies;
/// \brief True if the coalescer should aggressively coalesce fall-thru
/// blocks exclusively containing copies.
bool JoinSplitEdges;
/// Copy instructions yet to be coalesced.
SmallVector<MachineInstr*, 8> WorkList;
SmallVector<MachineInstr*, 8> LocalWorkList;
/// Set of instruction pointers that have been erased, and
/// that may be present in WorkList.
SmallPtrSet<MachineInstr*, 8> ErasedInstrs;
/// Dead instructions that are about to be deleted.
SmallVector<MachineInstr*, 8> DeadDefs;
/// Virtual registers to be considered for register class inflation.
SmallVector<unsigned, 8> InflateRegs;
/// Recursively eliminate dead defs in DeadDefs.
void eliminateDeadDefs();
/// LiveRangeEdit callback for eliminateDeadDefs().
void LRE_WillEraseInstruction(MachineInstr *MI) override;
/// Coalesce the LocalWorkList.
void coalesceLocals();
/// Join compatible live intervals
void joinAllIntervals();
/// Coalesce copies in the specified MBB, putting
/// copies that cannot yet be coalesced into WorkList.
void copyCoalesceInMBB(MachineBasicBlock *MBB);
/// Tries to coalesce all copies in CurrList. Returns true if any progress
/// was made.
bool copyCoalesceWorkList(MutableArrayRef<MachineInstr*> CurrList);
/// Attempt to join intervals corresponding to SrcReg/DstReg, which are the
/// src/dst of the copy instruction CopyMI. This returns true if the copy
/// was successfully coalesced away. If it is not currently possible to
/// coalesce this interval, but it may be possible if other things get
/// coalesced, then it returns true by reference in 'Again'.
bool joinCopy(MachineInstr *TheCopy, bool &Again);
/// Attempt to join these two intervals. On failure, this
/// returns false. The output "SrcInt" will not have been modified, so we
/// can use this information below to update aliases.
bool joinIntervals(CoalescerPair &CP);
/// Attempt joining two virtual registers. Return true on success.
bool joinVirtRegs(CoalescerPair &CP);
/// Attempt joining with a reserved physreg.
bool joinReservedPhysReg(CoalescerPair &CP);
/// Add the LiveRange @p ToMerge as a subregister liverange of @p LI.
/// Subranges in @p LI which only partially interfere with the desired
/// LaneMask are split as necessary. @p LaneMask are the lanes that
/// @p ToMerge will occupy in the coalescer register. @p LI has its subrange
/// lanemasks already adjusted to the coalesced register.
void mergeSubRangeInto(LiveInterval &LI, const LiveRange &ToMerge,
LaneBitmask LaneMask, CoalescerPair &CP);
/// Join the liveranges of two subregisters. Joins @p RRange into
/// @p LRange, @p RRange may be invalid afterwards.
void joinSubRegRanges(LiveRange &LRange, LiveRange &RRange,
LaneBitmask LaneMask, const CoalescerPair &CP);
/// We found a non-trivially-coalescable copy. If the source value number is
/// defined by a copy from the destination reg see if we can merge these two
/// destination reg valno# into a single value number, eliminating a copy.
/// This returns true if an interval was modified.
bool adjustCopiesBackFrom(const CoalescerPair &CP, MachineInstr *CopyMI);
/// Return true if there are definitions of IntB
/// other than BValNo val# that can reach uses of AValno val# of IntA.
bool hasOtherReachingDefs(LiveInterval &IntA, LiveInterval &IntB,
VNInfo *AValNo, VNInfo *BValNo);
/// We found a non-trivially-coalescable copy.
/// If the source value number is defined by a commutable instruction and
/// its other operand is coalesced to the copy dest register, see if we
/// can transform the copy into a noop by commuting the definition.
/// This returns true if an interval was modified.
bool removeCopyByCommutingDef(const CoalescerPair &CP,MachineInstr *CopyMI);
/// If the source of a copy is defined by a
/// trivial computation, replace the copy by rematerialize the definition.
bool reMaterializeTrivialDef(const CoalescerPair &CP, MachineInstr *CopyMI,
bool &IsDefCopy);
/// Return true if a copy involving a physreg should be joined.
bool canJoinPhys(const CoalescerPair &CP);
/// Replace all defs and uses of SrcReg to DstReg and update the subregister
/// number if it is not zero. If DstReg is a physical register and the
/// existing subregister number of the def / use being updated is not zero,
/// make sure to set it to the correct physical subregister.
void updateRegDefsUses(unsigned SrcReg, unsigned DstReg, unsigned SubIdx);
/// If the given machine operand reads only undefined lanes add an undef
/// flag.
/// This can happen when undef uses were previously concealed by a copy
/// which we coalesced. Example:
/// %vreg0:sub0<def,read-undef> = ...
/// %vreg1 = COPY %vreg0 <-- Coalescing COPY reveals undef
/// = use %vreg1:sub1 <-- hidden undef use
void addUndefFlag(const LiveInterval &Int, SlotIndex UseIdx,
MachineOperand &MO, unsigned SubRegIdx);
/// Handle copies of undef values.
/// Returns true if @p CopyMI was a copy of an undef value and eliminated.
bool eliminateUndefCopy(MachineInstr *CopyMI);
/// Check whether or not we should apply the terminal rule on the
/// destination (Dst) of \p Copy.
/// When the terminal rule applies, Copy is not profitable to
/// coalesce.
/// Dst is terminal if it has exactly one affinity (Dst, Src) and
/// at least one interference (Dst, Dst2). If Dst is terminal, the
/// terminal rule consists in checking that at least one of
/// interfering node, say Dst2, has an affinity of equal or greater
/// weight with Src.
/// In that case, Dst2 and Dst will not be able to be both coalesced
/// with Src. Since Dst2 exposes more coalescing opportunities than
/// Dst, we can drop \p Copy.
bool applyTerminalRule(const MachineInstr &Copy) const;
/// Wrapper method for \see LiveIntervals::shrinkToUses.
/// This method does the proper fixing of the live-ranges when the afore
/// mentioned method returns true.
void shrinkToUses(LiveInterval *LI,
SmallVectorImpl<MachineInstr * > *Dead = nullptr) {
if (LIS->shrinkToUses(LI, Dead)) {
/// Check whether or not \p LI is composed by multiple connected
/// components and if that is the case, fix that.
SmallVector<LiveInterval*, 8> SplitLIs;
LIS->splitSeparateComponents(*LI, SplitLIs);
}
}
public:
static char ID; ///< Class identification, replacement for typeinfo
RegisterCoalescer() : MachineFunctionPass(ID) {
initializeRegisterCoalescerPass(*PassRegistry::getPassRegistry());
}
void getAnalysisUsage(AnalysisUsage &AU) const override;
void releaseMemory() override;
/// This is the pass entry point.
bool runOnMachineFunction(MachineFunction&) override;
/// Implement the dump method.
void print(raw_ostream &O, const Module* = nullptr) const override;
};
} // end anonymous namespace
char &llvm::RegisterCoalescerID = RegisterCoalescer::ID;
INITIALIZE_PASS_BEGIN(RegisterCoalescer, "simple-register-coalescing",
"Simple Register Coalescing", false, false)
INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
INITIALIZE_PASS_DEPENDENCY(SlotIndexes)
INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
INITIALIZE_PASS_END(RegisterCoalescer, "simple-register-coalescing",
"Simple Register Coalescing", false, false)
char RegisterCoalescer::ID = 0;
static bool isMoveInstr(const TargetRegisterInfo &tri, const MachineInstr *MI,
unsigned &Src, unsigned &Dst,
unsigned &SrcSub, unsigned &DstSub) {
if (MI->isCopy()) {
Dst = MI->getOperand(0).getReg();
DstSub = MI->getOperand(0).getSubReg();
Src = MI->getOperand(1).getReg();
SrcSub = MI->getOperand(1).getSubReg();
} else if (MI->isSubregToReg()) {
Dst = MI->getOperand(0).getReg();
DstSub = tri.composeSubRegIndices(MI->getOperand(0).getSubReg(),
MI->getOperand(3).getImm());
Src = MI->getOperand(2).getReg();
SrcSub = MI->getOperand(2).getSubReg();
} else
return false;
return true;
}
/// Return true if this block should be vacated by the coalescer to eliminate
/// branches. The important cases to handle in the coalescer are critical edges
/// split during phi elimination which contain only copies. Simple blocks that
/// contain non-branches should also be vacated, but this can be handled by an
/// earlier pass similar to early if-conversion.
static bool isSplitEdge(const MachineBasicBlock *MBB) {
if (MBB->pred_size() != 1 || MBB->succ_size() != 1)
return false;
for (const auto &MI : *MBB) {
if (!MI.isCopyLike() && !MI.isUnconditionalBranch())
return false;
}
return true;
}
bool CoalescerPair::setRegisters(const MachineInstr *MI) {
SrcReg = DstReg = 0;
SrcIdx = DstIdx = 0;
NewRC = nullptr;
Flipped = CrossClass = false;
unsigned Src, Dst, SrcSub, DstSub;
if (!isMoveInstr(TRI, MI, Src, Dst, SrcSub, DstSub))
return false;
Partial = SrcSub || DstSub;
// If one register is a physreg, it must be Dst.
if (TargetRegisterInfo::isPhysicalRegister(Src)) {
if (TargetRegisterInfo::isPhysicalRegister(Dst))
return false;
std::swap(Src, Dst);
std::swap(SrcSub, DstSub);
Flipped = true;
}
const MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo();
if (TargetRegisterInfo::isPhysicalRegister(Dst)) {
// Eliminate DstSub on a physreg.
if (DstSub) {
Dst = TRI.getSubReg(Dst, DstSub);
if (!Dst) return false;
DstSub = 0;
}
// Eliminate SrcSub by picking a corresponding Dst superregister.
if (SrcSub) {
Dst = TRI.getMatchingSuperReg(Dst, SrcSub, MRI.getRegClass(Src));
if (!Dst) return false;
} else if (!MRI.getRegClass(Src)->contains(Dst)) {
return false;
}
} else {
// Both registers are virtual.
const TargetRegisterClass *SrcRC = MRI.getRegClass(Src);
const TargetRegisterClass *DstRC = MRI.getRegClass(Dst);
// Both registers have subreg indices.
if (SrcSub && DstSub) {
// Copies between different sub-registers are never coalescable.
if (Src == Dst && SrcSub != DstSub)
return false;
NewRC = TRI.getCommonSuperRegClass(SrcRC, SrcSub, DstRC, DstSub,
SrcIdx, DstIdx);
if (!NewRC)
return false;
} else if (DstSub) {
// SrcReg will be merged with a sub-register of DstReg.
SrcIdx = DstSub;
NewRC = TRI.getMatchingSuperRegClass(DstRC, SrcRC, DstSub);
} else if (SrcSub) {
// DstReg will be merged with a sub-register of SrcReg.
DstIdx = SrcSub;
NewRC = TRI.getMatchingSuperRegClass(SrcRC, DstRC, SrcSub);
} else {
// This is a straight copy without sub-registers.
NewRC = TRI.getCommonSubClass(DstRC, SrcRC);
}
// The combined constraint may be impossible to satisfy.
if (!NewRC)
return false;
// Prefer SrcReg to be a sub-register of DstReg.
// FIXME: Coalescer should support subregs symmetrically.
if (DstIdx && !SrcIdx) {
std::swap(Src, Dst);
std::swap(SrcIdx, DstIdx);
Flipped = !Flipped;
}
CrossClass = NewRC != DstRC || NewRC != SrcRC;
}
// Check our invariants
assert(TargetRegisterInfo::isVirtualRegister(Src) && "Src must be virtual");
assert(!(TargetRegisterInfo::isPhysicalRegister(Dst) && DstSub) &&
"Cannot have a physical SubIdx");
SrcReg = Src;
DstReg = Dst;
return true;
}
bool CoalescerPair::flip() {
if (TargetRegisterInfo::isPhysicalRegister(DstReg))
return false;
std::swap(SrcReg, DstReg);
std::swap(SrcIdx, DstIdx);
Flipped = !Flipped;
return true;
}
bool CoalescerPair::isCoalescable(const MachineInstr *MI) const {
if (!MI)
return false;
unsigned Src, Dst, SrcSub, DstSub;
if (!isMoveInstr(TRI, MI, Src, Dst, SrcSub, DstSub))
return false;
// Find the virtual register that is SrcReg.
if (Dst == SrcReg) {
std::swap(Src, Dst);
std::swap(SrcSub, DstSub);
} else if (Src != SrcReg) {
return false;
}
// Now check that Dst matches DstReg.
if (TargetRegisterInfo::isPhysicalRegister(DstReg)) {
if (!TargetRegisterInfo::isPhysicalRegister(Dst))
return false;
assert(!DstIdx && !SrcIdx && "Inconsistent CoalescerPair state.");
// DstSub could be set for a physreg from INSERT_SUBREG.
if (DstSub)
Dst = TRI.getSubReg(Dst, DstSub);
// Full copy of Src.
if (!SrcSub)
return DstReg == Dst;
// This is a partial register copy. Check that the parts match.
return TRI.getSubReg(DstReg, SrcSub) == Dst;
} else {
// DstReg is virtual.
if (DstReg != Dst)
return false;
// Registers match, do the subregisters line up?
return TRI.composeSubRegIndices(SrcIdx, SrcSub) ==
TRI.composeSubRegIndices(DstIdx, DstSub);
}
}
void RegisterCoalescer::getAnalysisUsage(AnalysisUsage &AU) const {
AU.setPreservesCFG();
AU.addRequired<AAResultsWrapperPass>();
AU.addRequired<LiveIntervals>();
AU.addPreserved<LiveIntervals>();
AU.addPreserved<SlotIndexes>();
AU.addRequired<MachineLoopInfo>();
AU.addPreserved<MachineLoopInfo>();
AU.addPreservedID(MachineDominatorsID);
MachineFunctionPass::getAnalysisUsage(AU);
}
void RegisterCoalescer::eliminateDeadDefs() {
SmallVector<unsigned, 8> NewRegs;
LiveRangeEdit(nullptr, NewRegs, *MF, *LIS,
nullptr, this).eliminateDeadDefs(DeadDefs);
}
void RegisterCoalescer::LRE_WillEraseInstruction(MachineInstr *MI) {
// MI may be in WorkList. Make sure we don't visit it.
ErasedInstrs.insert(MI);
}
bool RegisterCoalescer::adjustCopiesBackFrom(const CoalescerPair &CP,
MachineInstr *CopyMI) {
assert(!CP.isPartial() && "This doesn't work for partial copies.");
assert(!CP.isPhys() && "This doesn't work for physreg copies.");
LiveInterval &IntA =
LIS->getInterval(CP.isFlipped() ? CP.getDstReg() : CP.getSrcReg());
LiveInterval &IntB =
LIS->getInterval(CP.isFlipped() ? CP.getSrcReg() : CP.getDstReg());
SlotIndex CopyIdx = LIS->getInstructionIndex(*CopyMI).getRegSlot();
// We have a non-trivially-coalescable copy with IntA being the source and
// IntB being the dest, thus this defines a value number in IntB. If the
// source value number (in IntA) is defined by a copy from B, see if we can
// merge these two pieces of B into a single value number, eliminating a copy.
// For example:
//
// A3 = B0
// ...
// B1 = A3 <- this copy
//
// In this case, B0 can be extended to where the B1 copy lives, allowing the
// B1 value number to be replaced with B0 (which simplifies the B
// liveinterval).
// BValNo is a value number in B that is defined by a copy from A. 'B1' in
// the example above.
LiveInterval::iterator BS = IntB.FindSegmentContaining(CopyIdx);
if (BS == IntB.end()) return false;
VNInfo *BValNo = BS->valno;
// Get the location that B is defined at. Two options: either this value has
// an unknown definition point or it is defined at CopyIdx. If unknown, we
// can't process it.
if (BValNo->def != CopyIdx) return false;
// AValNo is the value number in A that defines the copy, A3 in the example.
SlotIndex CopyUseIdx = CopyIdx.getRegSlot(true);
LiveInterval::iterator AS = IntA.FindSegmentContaining(CopyUseIdx);
// The live segment might not exist after fun with physreg coalescing.
if (AS == IntA.end()) return false;
VNInfo *AValNo = AS->valno;
// If AValNo is defined as a copy from IntB, we can potentially process this.
// Get the instruction that defines this value number.
MachineInstr *ACopyMI = LIS->getInstructionFromIndex(AValNo->def);
// Don't allow any partial copies, even if isCoalescable() allows them.
if (!CP.isCoalescable(ACopyMI) || !ACopyMI->isFullCopy())
return false;
// Get the Segment in IntB that this value number starts with.
LiveInterval::iterator ValS =
IntB.FindSegmentContaining(AValNo->def.getPrevSlot());
if (ValS == IntB.end())
return false;
// Make sure that the end of the live segment is inside the same block as
// CopyMI.
MachineInstr *ValSEndInst =
LIS->getInstructionFromIndex(ValS->end.getPrevSlot());
if (!ValSEndInst || ValSEndInst->getParent() != CopyMI->getParent())
return false;
// Okay, we now know that ValS ends in the same block that the CopyMI
// live-range starts. If there are no intervening live segments between them
// in IntB, we can merge them.
if (ValS+1 != BS) return false;
DEBUG(dbgs() << "Extending: " << PrintReg(IntB.reg, TRI));
SlotIndex FillerStart = ValS->end, FillerEnd = BS->start;
// We are about to delete CopyMI, so need to remove it as the 'instruction
// that defines this value #'. Update the valnum with the new defining
// instruction #.
BValNo->def = FillerStart;
// Okay, we can merge them. We need to insert a new liverange:
// [ValS.end, BS.begin) of either value number, then we merge the
// two value numbers.
IntB.addSegment(LiveInterval::Segment(FillerStart, FillerEnd, BValNo));
// Okay, merge "B1" into the same value number as "B0".
if (BValNo != ValS->valno)
IntB.MergeValueNumberInto(BValNo, ValS->valno);
// Do the same for the subregister segments.
for (LiveInterval::SubRange &S : IntB.subranges()) {
VNInfo *SubBValNo = S.getVNInfoAt(CopyIdx);
S.addSegment(LiveInterval::Segment(FillerStart, FillerEnd, SubBValNo));
VNInfo *SubValSNo = S.getVNInfoAt(AValNo->def.getPrevSlot());
if (SubBValNo != SubValSNo)
S.MergeValueNumberInto(SubBValNo, SubValSNo);
}
DEBUG(dbgs() << " result = " << IntB << '\n');
// If the source instruction was killing the source register before the
// merge, unset the isKill marker given the live range has been extended.
int UIdx = ValSEndInst->findRegisterUseOperandIdx(IntB.reg, true);
if (UIdx != -1) {
ValSEndInst->getOperand(UIdx).setIsKill(false);
}
// Rewrite the copy. If the copy instruction was killing the destination
// register before the merge, find the last use and trim the live range. That
// will also add the isKill marker.
CopyMI->substituteRegister(IntA.reg, IntB.reg, 0, *TRI);
if (AS->end == CopyIdx)
shrinkToUses(&IntA);
++numExtends;
return true;
}
bool RegisterCoalescer::hasOtherReachingDefs(LiveInterval &IntA,
LiveInterval &IntB,
VNInfo *AValNo,
VNInfo *BValNo) {
// If AValNo has PHI kills, conservatively assume that IntB defs can reach
// the PHI values.
if (LIS->hasPHIKill(IntA, AValNo))
return true;
for (LiveRange::Segment &ASeg : IntA.segments) {
if (ASeg.valno != AValNo) continue;
LiveInterval::iterator BI =
std::upper_bound(IntB.begin(), IntB.end(), ASeg.start);
if (BI != IntB.begin())
--BI;
for (; BI != IntB.end() && ASeg.end >= BI->start; ++BI) {
if (BI->valno == BValNo)
continue;
if (BI->start <= ASeg.start && BI->end > ASeg.start)
return true;
if (BI->start > ASeg.start && BI->start < ASeg.end)
return true;
}
}
return false;
}
/// Copy segements with value number @p SrcValNo from liverange @p Src to live
/// range @Dst and use value number @p DstValNo there.
static void addSegmentsWithValNo(LiveRange &Dst, VNInfo *DstValNo,
const LiveRange &Src, const VNInfo *SrcValNo)
{
for (const LiveRange::Segment &S : Src.segments) {
if (S.valno != SrcValNo)
continue;
Dst.addSegment(LiveRange::Segment(S.start, S.end, DstValNo));
}
}
bool RegisterCoalescer::removeCopyByCommutingDef(const CoalescerPair &CP,
MachineInstr *CopyMI) {
assert(!CP.isPhys());
LiveInterval &IntA =
LIS->getInterval(CP.isFlipped() ? CP.getDstReg() : CP.getSrcReg());
LiveInterval &IntB =
LIS->getInterval(CP.isFlipped() ? CP.getSrcReg() : CP.getDstReg());
// We found a non-trivially-coalescable copy with IntA being the source and
// IntB being the dest, thus this defines a value number in IntB. If the
// source value number (in IntA) is defined by a commutable instruction and
// its other operand is coalesced to the copy dest register, see if we can
// transform the copy into a noop by commuting the definition. For example,
//
// A3 = op A2 B0<kill>
// ...
// B1 = A3 <- this copy
// ...
// = op A3 <- more uses
//
// ==>
//
// B2 = op B0 A2<kill>
// ...
// B1 = B2 <- now an identity copy
// ...
// = op B2 <- more uses
// BValNo is a value number in B that is defined by a copy from A. 'B1' in
// the example above.
SlotIndex CopyIdx = LIS->getInstructionIndex(*CopyMI).getRegSlot();
VNInfo *BValNo = IntB.getVNInfoAt(CopyIdx);
assert(BValNo != nullptr && BValNo->def == CopyIdx);
// AValNo is the value number in A that defines the copy, A3 in the example.
VNInfo *AValNo = IntA.getVNInfoAt(CopyIdx.getRegSlot(true));
assert(AValNo && !AValNo->isUnused() && "COPY source not live");
if (AValNo->isPHIDef())
return false;
MachineInstr *DefMI = LIS->getInstructionFromIndex(AValNo->def);
if (!DefMI)
return false;
if (!DefMI->isCommutable())
return false;
// If DefMI is a two-address instruction then commuting it will change the
// destination register.
int DefIdx = DefMI->findRegisterDefOperandIdx(IntA.reg);
assert(DefIdx != -1);
unsigned UseOpIdx;
if (!DefMI->isRegTiedToUseOperand(DefIdx, &UseOpIdx))
return false;
// FIXME: The code below tries to commute 'UseOpIdx' operand with some other
// commutable operand which is expressed by 'CommuteAnyOperandIndex'value
// passed to the method. That _other_ operand is chosen by
// the findCommutedOpIndices() method.
//
// That is obviously an area for improvement in case of instructions having
// more than 2 operands. For example, if some instruction has 3 commutable
// operands then all possible variants (i.e. op#1<->op#2, op#1<->op#3,
// op#2<->op#3) of commute transformation should be considered/tried here.
unsigned NewDstIdx = TargetInstrInfo::CommuteAnyOperandIndex;
if (!TII->findCommutedOpIndices(*DefMI, UseOpIdx, NewDstIdx))
return false;
MachineOperand &NewDstMO = DefMI->getOperand(NewDstIdx);
unsigned NewReg = NewDstMO.getReg();
if (NewReg != IntB.reg || !IntB.Query(AValNo->def).isKill())
return false;
// Make sure there are no other definitions of IntB that would reach the
// uses which the new definition can reach.
if (hasOtherReachingDefs(IntA, IntB, AValNo, BValNo))
return false;
// If some of the uses of IntA.reg is already coalesced away, return false.
// It's not possible to determine whether it's safe to perform the coalescing.
for (MachineOperand &MO : MRI->use_nodbg_operands(IntA.reg)) {
MachineInstr *UseMI = MO.getParent();
unsigned OpNo = &MO - &UseMI->getOperand(0);
SlotIndex UseIdx = LIS->getInstructionIndex(*UseMI);
LiveInterval::iterator US = IntA.FindSegmentContaining(UseIdx);
if (US == IntA.end() || US->valno != AValNo)
continue;
// If this use is tied to a def, we can't rewrite the register.
if (UseMI->isRegTiedToDefOperand(OpNo))
return false;
}
DEBUG(dbgs() << "\tremoveCopyByCommutingDef: " << AValNo->def << '\t'
<< *DefMI);
// At this point we have decided that it is legal to do this
// transformation. Start by commuting the instruction.
MachineBasicBlock *MBB = DefMI->getParent();
MachineInstr *NewMI =
TII->commuteInstruction(*DefMI, false, UseOpIdx, NewDstIdx);
if (!NewMI)
return false;
if (TargetRegisterInfo::isVirtualRegister(IntA.reg) &&
TargetRegisterInfo::isVirtualRegister(IntB.reg) &&
!MRI->constrainRegClass(IntB.reg, MRI->getRegClass(IntA.reg)))
return false;
if (NewMI != DefMI) {
LIS->ReplaceMachineInstrInMaps(*DefMI, *NewMI);
MachineBasicBlock::iterator Pos = DefMI;
MBB->insert(Pos, NewMI);
MBB->erase(DefMI);
}
// If ALR and BLR overlaps and end of BLR extends beyond end of ALR, e.g.
// A = or A, B
// ...
// B = A
// ...
// C = A<kill>
// ...
// = B
// Update uses of IntA of the specific Val# with IntB.
for (MachineRegisterInfo::use_iterator UI = MRI->use_begin(IntA.reg),
UE = MRI->use_end();
UI != UE; /* ++UI is below because of possible MI removal */) {
MachineOperand &UseMO = *UI;
++UI;
if (UseMO.isUndef())
continue;
MachineInstr *UseMI = UseMO.getParent();
if (UseMI->isDebugValue()) {
// FIXME These don't have an instruction index. Not clear we have enough
// info to decide whether to do this replacement or not. For now do it.
UseMO.setReg(NewReg);
continue;
}
SlotIndex UseIdx = LIS->getInstructionIndex(*UseMI).getRegSlot(true);
LiveInterval::iterator US = IntA.FindSegmentContaining(UseIdx);
assert(US != IntA.end() && "Use must be live");
if (US->valno != AValNo)
continue;
// Kill flags are no longer accurate. They are recomputed after RA.
UseMO.setIsKill(false);
if (TargetRegisterInfo::isPhysicalRegister(NewReg))
UseMO.substPhysReg(NewReg, *TRI);
else
UseMO.setReg(NewReg);
if (UseMI == CopyMI)
continue;
if (!UseMI->isCopy())
continue;
if (UseMI->getOperand(0).getReg() != IntB.reg ||
UseMI->getOperand(0).getSubReg())
continue;
// This copy will become a noop. If it's defining a new val#, merge it into
// BValNo.
SlotIndex DefIdx = UseIdx.getRegSlot();
VNInfo *DVNI = IntB.getVNInfoAt(DefIdx);
if (!DVNI)
continue;
DEBUG(dbgs() << "\t\tnoop: " << DefIdx << '\t' << *UseMI);
assert(DVNI->def == DefIdx);
BValNo = IntB.MergeValueNumberInto(DVNI, BValNo);
for (LiveInterval::SubRange &S : IntB.subranges()) {
VNInfo *SubDVNI = S.getVNInfoAt(DefIdx);
if (!SubDVNI)
continue;
VNInfo *SubBValNo = S.getVNInfoAt(CopyIdx);
assert(SubBValNo->def == CopyIdx);
S.MergeValueNumberInto(SubDVNI, SubBValNo);
}
ErasedInstrs.insert(UseMI);
LIS->RemoveMachineInstrFromMaps(*UseMI);
UseMI->eraseFromParent();
}
// Extend BValNo by merging in IntA live segments of AValNo. Val# definition
// is updated.
BumpPtrAllocator &Allocator = LIS->getVNInfoAllocator();
if (IntB.hasSubRanges()) {
if (!IntA.hasSubRanges()) {
LaneBitmask Mask = MRI->getMaxLaneMaskForVReg(IntA.reg);
IntA.createSubRangeFrom(Allocator, Mask, IntA);
}
SlotIndex AIdx = CopyIdx.getRegSlot(true);
for (LiveInterval::SubRange &SA : IntA.subranges()) {
VNInfo *ASubValNo = SA.getVNInfoAt(AIdx);
assert(ASubValNo != nullptr);
LaneBitmask AMask = SA.LaneMask;
for (LiveInterval::SubRange &SB : IntB.subranges()) {
LaneBitmask BMask = SB.LaneMask;
LaneBitmask Common = BMask & AMask;
if (Common.none())
continue;
DEBUG( dbgs() << "\t\tCopy_Merge " << PrintLaneMask(BMask)
<< " into " << PrintLaneMask(Common) << '\n');
LaneBitmask BRest = BMask & ~AMask;
LiveInterval::SubRange *CommonRange;
if (BRest.any()) {
SB.LaneMask = BRest;
DEBUG(dbgs() << "\t\tReduce Lane to " << PrintLaneMask(BRest)
<< '\n');
// Duplicate SubRange for newly merged common stuff.
CommonRange = IntB.createSubRangeFrom(Allocator, Common, SB);
} else {
// We van reuse the L SubRange.
SB.LaneMask = Common;
CommonRange = &SB;
}
LiveRange RangeCopy(SB, Allocator);
VNInfo *BSubValNo = CommonRange->getVNInfoAt(CopyIdx);
assert(BSubValNo->def == CopyIdx);
BSubValNo->def = ASubValNo->def;
addSegmentsWithValNo(*CommonRange, BSubValNo, SA, ASubValNo);
AMask &= ~BMask;
}
if (AMask.any()) {
DEBUG(dbgs() << "\t\tNew Lane " << PrintLaneMask(AMask) << '\n');
LiveRange *NewRange = IntB.createSubRange(Allocator, AMask);
VNInfo *BSubValNo = NewRange->getNextValue(CopyIdx, Allocator);
addSegmentsWithValNo(*NewRange, BSubValNo, SA, ASubValNo);
}
}
}
BValNo->def = AValNo->def;
addSegmentsWithValNo(IntB, BValNo, IntA, AValNo);
DEBUG(dbgs() << "\t\textended: " << IntB << '\n');
LIS->removeVRegDefAt(IntA, AValNo->def);
DEBUG(dbgs() << "\t\ttrimmed: " << IntA << '\n');
++numCommutes;
return true;
}
/// Returns true if @p MI defines the full vreg @p Reg, as opposed to just
/// defining a subregister.
static bool definesFullReg(const MachineInstr &MI, unsigned Reg) {
assert(!TargetRegisterInfo::isPhysicalRegister(Reg) &&
"This code cannot handle physreg aliasing");
for (const MachineOperand &Op : MI.operands()) {
if (!Op.isReg() || !Op.isDef() || Op.getReg() != Reg)
continue;
// Return true if we define the full register or don't care about the value
// inside other subregisters.
if (Op.getSubReg() == 0 || Op.isUndef())
return true;
}
return false;
}
bool RegisterCoalescer::reMaterializeTrivialDef(const CoalescerPair &CP,
MachineInstr *CopyMI,
bool &IsDefCopy) {
IsDefCopy = false;
unsigned SrcReg = CP.isFlipped() ? CP.getDstReg() : CP.getSrcReg();
unsigned SrcIdx = CP.isFlipped() ? CP.getDstIdx() : CP.getSrcIdx();
unsigned DstReg = CP.isFlipped() ? CP.getSrcReg() : CP.getDstReg();
unsigned DstIdx = CP.isFlipped() ? CP.getSrcIdx() : CP.getDstIdx();
if (TargetRegisterInfo::isPhysicalRegister(SrcReg))
return false;
LiveInterval &SrcInt = LIS->getInterval(SrcReg);
SlotIndex CopyIdx = LIS->getInstructionIndex(*CopyMI);
VNInfo *ValNo = SrcInt.Query(CopyIdx).valueIn();
assert(ValNo && "CopyMI input register not live");
if (ValNo->isPHIDef() || ValNo->isUnused())
return false;
MachineInstr *DefMI = LIS->getInstructionFromIndex(ValNo->def);
if (!DefMI)
return false;
if (DefMI->isCopyLike()) {
IsDefCopy = true;
return false;
}
if (!TII->isAsCheapAsAMove(*DefMI))
return false;
if (!TII->isTriviallyReMaterializable(*DefMI, AA))
return false;
if (!definesFullReg(*DefMI, SrcReg))
return false;
bool SawStore = false;
if (!DefMI->isSafeToMove(AA, SawStore))
return false;
const MCInstrDesc &MCID = DefMI->getDesc();
if (MCID.getNumDefs() != 1)
return false;
// Only support subregister destinations when the def is read-undef.
MachineOperand &DstOperand = CopyMI->getOperand(0);
unsigned CopyDstReg = DstOperand.getReg();
if (DstOperand.getSubReg() && !DstOperand.isUndef())
return false;
// If both SrcIdx and DstIdx are set, correct rematerialization would widen
// the register substantially (beyond both source and dest size). This is bad
// for performance since it can cascade through a function, introducing many
// extra spills and fills (e.g. ARM can easily end up copying QQQQPR registers
// around after a few subreg copies).
if (SrcIdx && DstIdx)
return false;
const TargetRegisterClass *DefRC = TII->getRegClass(MCID, 0, TRI, *MF);
if (!DefMI->isImplicitDef()) {
if (TargetRegisterInfo::isPhysicalRegister(DstReg)) {
unsigned NewDstReg = DstReg;
unsigned NewDstIdx = TRI->composeSubRegIndices(CP.getSrcIdx(),
DefMI->getOperand(0).getSubReg());
if (NewDstIdx)
NewDstReg = TRI->getSubReg(DstReg, NewDstIdx);
// Finally, make sure that the physical subregister that will be
// constructed later is permitted for the instruction.
if (!DefRC->contains(NewDstReg))
return false;
} else {
// Theoretically, some stack frame reference could exist. Just make sure
// it hasn't actually happened.
assert(TargetRegisterInfo::isVirtualRegister(DstReg) &&
"Only expect to deal with virtual or physical registers");
}
}
DebugLoc DL = CopyMI->getDebugLoc();
MachineBasicBlock *MBB = CopyMI->getParent();
MachineBasicBlock::iterator MII =
std::next(MachineBasicBlock::iterator(CopyMI));
TII->reMaterialize(*MBB, MII, DstReg, SrcIdx, *DefMI, *TRI);
MachineInstr &NewMI = *std::prev(MII);
NewMI.setDebugLoc(DL);
// In a situation like the following:
// %vreg0:subreg = instr ; DefMI, subreg = DstIdx
// %vreg1 = copy %vreg0:subreg ; CopyMI, SrcIdx = 0
// instead of widening %vreg1 to the register class of %vreg0 simply do:
// %vreg1 = instr
const TargetRegisterClass *NewRC = CP.getNewRC();
if (DstIdx != 0) {
MachineOperand &DefMO = NewMI.getOperand(0);
if (DefMO.getSubReg() == DstIdx) {
assert(SrcIdx == 0 && CP.isFlipped()
&& "Shouldn't have SrcIdx+DstIdx at this point");
const TargetRegisterClass *DstRC = MRI->getRegClass(DstReg);
const TargetRegisterClass *CommonRC =
TRI->getCommonSubClass(DefRC, DstRC);
if (CommonRC != nullptr) {
NewRC = CommonRC;
DstIdx = 0;
DefMO.setSubReg(0);
DefMO.setIsUndef(false); // Only subregs can have def+undef.
}
}
}
// CopyMI may have implicit operands, save them so that we can transfer them
// over to the newly materialized instruction after CopyMI is removed.
SmallVector<MachineOperand, 4> ImplicitOps;
ImplicitOps.reserve(CopyMI->getNumOperands() -
CopyMI->getDesc().getNumOperands());
for (unsigned I = CopyMI->getDesc().getNumOperands(),
E = CopyMI->getNumOperands();
I != E; ++I) {
MachineOperand &MO = CopyMI->getOperand(I);
if (MO.isReg()) {
assert(MO.isImplicit() && "No explicit operands after implict operands.");
// Discard VReg implicit defs.
if (TargetRegisterInfo::isPhysicalRegister(MO.getReg()))
ImplicitOps.push_back(MO);
}
}
LIS->ReplaceMachineInstrInMaps(*CopyMI, NewMI);
CopyMI->eraseFromParent();
ErasedInstrs.insert(CopyMI);
// NewMI may have dead implicit defs (E.g. EFLAGS for MOV<bits>r0 on X86).
// We need to remember these so we can add intervals once we insert
// NewMI into SlotIndexes.
SmallVector<unsigned, 4> NewMIImplDefs;
for (unsigned i = NewMI.getDesc().getNumOperands(),
e = NewMI.getNumOperands();
i != e; ++i) {
MachineOperand &MO = NewMI.getOperand(i);
if (MO.isReg() && MO.isDef()) {
assert(MO.isImplicit() && MO.isDead() &&
TargetRegisterInfo::isPhysicalRegister(MO.getReg()));
NewMIImplDefs.push_back(MO.getReg());
}
}
if (TargetRegisterInfo::isVirtualRegister(DstReg)) {
unsigned NewIdx = NewMI.getOperand(0).getSubReg();
if (DefRC != nullptr) {
if (NewIdx)
NewRC = TRI->getMatchingSuperRegClass(NewRC, DefRC, NewIdx);
else
NewRC = TRI->getCommonSubClass(NewRC, DefRC);
assert(NewRC && "subreg chosen for remat incompatible with instruction");
}
// Remap subranges to new lanemask and change register class.
LiveInterval &DstInt = LIS->getInterval(DstReg);
for (LiveInterval::SubRange &SR : DstInt.subranges()) {
SR.LaneMask = TRI->composeSubRegIndexLaneMask(DstIdx, SR.LaneMask);
}
MRI->setRegClass(DstReg, NewRC);
// Update machine operands and add flags.
updateRegDefsUses(DstReg, DstReg, DstIdx);
NewMI.getOperand(0).setSubReg(NewIdx);
// Add dead subregister definitions if we are defining the whole register
// but only part of it is live.
// This could happen if the rematerialization instruction is rematerializing
// more than actually is used in the register.
// An example would be:
// vreg1 = LOAD CONSTANTS 5, 8 ; Loading both 5 and 8 in different subregs
// ; Copying only part of the register here, but the rest is undef.
// vreg2:sub_16bit<def, read-undef> = COPY vreg1:sub_16bit
// ==>
// ; Materialize all the constants but only using one
// vreg2 = LOAD_CONSTANTS 5, 8
//
// at this point for the part that wasn't defined before we could have
// subranges missing the definition.
if (NewIdx == 0 && DstInt.hasSubRanges()) {
SlotIndex CurrIdx = LIS->getInstructionIndex(NewMI);
SlotIndex DefIndex =
CurrIdx.getRegSlot(NewMI.getOperand(0).isEarlyClobber());
LaneBitmask MaxMask = MRI->getMaxLaneMaskForVReg(DstReg);
VNInfo::Allocator& Alloc = LIS->getVNInfoAllocator();
for (LiveInterval::SubRange &SR : DstInt.subranges()) {
if (!SR.liveAt(DefIndex))
SR.createDeadDef(DefIndex, Alloc);
MaxMask &= ~SR.LaneMask;
}
if (MaxMask.any()) {
LiveInterval::SubRange *SR = DstInt.createSubRange(Alloc, MaxMask);
SR->createDeadDef(DefIndex, Alloc);
}
}
} else if (NewMI.getOperand(0).getReg() != CopyDstReg) {
// The New instruction may be defining a sub-register of what's actually
// been asked for. If so it must implicitly define the whole thing.
assert(TargetRegisterInfo::isPhysicalRegister(DstReg) &&
"Only expect virtual or physical registers in remat");
NewMI.getOperand(0).setIsDead(true);
NewMI.addOperand(MachineOperand::CreateReg(
CopyDstReg, true /*IsDef*/, true /*IsImp*/, false /*IsKill*/));
// Record small dead def live-ranges for all the subregisters
// of the destination register.
// Otherwise, variables that live through may miss some
// interferences, thus creating invalid allocation.
// E.g., i386 code:
// vreg1 = somedef ; vreg1 GR8
// vreg2 = remat ; vreg2 GR32
// CL = COPY vreg2.sub_8bit
// = somedef vreg1 ; vreg1 GR8
// =>
// vreg1 = somedef ; vreg1 GR8
// ECX<def, dead> = remat ; CL<imp-def>
// = somedef vreg1 ; vreg1 GR8
// vreg1 will see the inteferences with CL but not with CH since
// no live-ranges would have been created for ECX.
// Fix that!
SlotIndex NewMIIdx = LIS->getInstructionIndex(NewMI);
for (MCRegUnitIterator Units(NewMI.getOperand(0).getReg(), TRI);
Units.isValid(); ++Units)
if (LiveRange *LR = LIS->getCachedRegUnit(*Units))
LR->createDeadDef(NewMIIdx.getRegSlot(), LIS->getVNInfoAllocator());
}
if (NewMI.getOperand(0).getSubReg())
NewMI.getOperand(0).setIsUndef();
// Transfer over implicit operands to the rematerialized instruction.
for (MachineOperand &MO : ImplicitOps)
NewMI.addOperand(MO);
SlotIndex NewMIIdx = LIS->getInstructionIndex(NewMI);
for (unsigned i = 0, e = NewMIImplDefs.size(); i != e; ++i) {
unsigned Reg = NewMIImplDefs[i];
for (MCRegUnitIterator Units(Reg, TRI); Units.isValid(); ++Units)
if (LiveRange *LR = LIS->getCachedRegUnit(*Units))
LR->createDeadDef(NewMIIdx.getRegSlot(), LIS->getVNInfoAllocator());
}
DEBUG(dbgs() << "Remat: " << NewMI);
++NumReMats;
// The source interval can become smaller because we removed a use.
shrinkToUses(&SrcInt, &DeadDefs);
if (!DeadDefs.empty()) {
// If the virtual SrcReg is completely eliminated, update all DBG_VALUEs
// to describe DstReg instead.
for (MachineOperand &UseMO : MRI->use_operands(SrcReg)) {
MachineInstr *UseMI = UseMO.getParent();
if (UseMI->isDebugValue()) {
UseMO.setReg(DstReg);
DEBUG(dbgs() << "\t\tupdated: " << *UseMI);
}
}
eliminateDeadDefs();
}
return true;
}
bool RegisterCoalescer::eliminateUndefCopy(MachineInstr *CopyMI) {
// ProcessImpicitDefs may leave some copies of <undef> values, it only removes
// local variables. When we have a copy like:
//
// %vreg1 = COPY %vreg2<undef>
//
// We delete the copy and remove the corresponding value number from %vreg1.
// Any uses of that value number are marked as <undef>.
// Note that we do not query CoalescerPair here but redo isMoveInstr as the
// CoalescerPair may have a new register class with adjusted subreg indices
// at this point.
unsigned SrcReg, DstReg, SrcSubIdx, DstSubIdx;
isMoveInstr(*TRI, CopyMI, SrcReg, DstReg, SrcSubIdx, DstSubIdx);
SlotIndex Idx = LIS->getInstructionIndex(*CopyMI);
const LiveInterval &SrcLI = LIS->getInterval(SrcReg);
// CopyMI is undef iff SrcReg is not live before the instruction.
if (SrcSubIdx != 0 && SrcLI.hasSubRanges()) {
LaneBitmask SrcMask = TRI->getSubRegIndexLaneMask(SrcSubIdx);
for (const LiveInterval::SubRange &SR : SrcLI.subranges()) {
if ((SR.LaneMask & SrcMask).none())
continue;
if (SR.liveAt(Idx))
return false;
}
} else if (SrcLI.liveAt(Idx))
return false;
DEBUG(dbgs() << "\tEliminating copy of <undef> value\n");
// Remove any DstReg segments starting at the instruction.
LiveInterval &DstLI = LIS->getInterval(DstReg);
SlotIndex RegIndex = Idx.getRegSlot();
// Remove value or merge with previous one in case of a subregister def.
if (VNInfo *PrevVNI = DstLI.getVNInfoAt(Idx)) {
VNInfo *VNI = DstLI.getVNInfoAt(RegIndex);
DstLI.MergeValueNumberInto(VNI, PrevVNI);
// The affected subregister segments can be removed.
LaneBitmask DstMask = TRI->getSubRegIndexLaneMask(DstSubIdx);
for (LiveInterval::SubRange &SR : DstLI.subranges()) {
if ((SR.LaneMask & DstMask).none())
continue;
VNInfo *SVNI = SR.getVNInfoAt(RegIndex);
assert(SVNI != nullptr && SlotIndex::isSameInstr(SVNI->def, RegIndex));
SR.removeValNo(SVNI);
}
DstLI.removeEmptySubRanges();
} else
LIS->removeVRegDefAt(DstLI, RegIndex);
// Mark uses as undef.
for (MachineOperand &MO : MRI->reg_nodbg_operands(DstReg)) {
if (MO.isDef() /*|| MO.isUndef()*/)
continue;
const MachineInstr &MI = *MO.getParent();
SlotIndex UseIdx = LIS->getInstructionIndex(MI);
LaneBitmask UseMask = TRI->getSubRegIndexLaneMask(MO.getSubReg());
bool isLive;
if (!UseMask.all() && DstLI.hasSubRanges()) {
isLive = false;
for (const LiveInterval::SubRange &SR : DstLI.subranges()) {
if ((SR.LaneMask & UseMask).none())
continue;
if (SR.liveAt(UseIdx)) {
isLive = true;
break;
}
}
} else
isLive = DstLI.liveAt(UseIdx);
if (isLive)
continue;
MO.setIsUndef(true);
DEBUG(dbgs() << "\tnew undef: " << UseIdx << '\t' << MI);
}
// A def of a subregister may be a use of the other subregisters, so
// deleting a def of a subregister may also remove uses. Since CopyMI
// is still part of the function (but about to be erased), mark all
// defs of DstReg in it as <undef>, so that shrinkToUses would
// ignore them.
for (MachineOperand &MO : CopyMI->operands())
if (MO.isReg() && MO.isDef() && MO.getReg() == DstReg)
MO.setIsUndef(true);
LIS->shrinkToUses(&DstLI);
return true;
}
void RegisterCoalescer::addUndefFlag(const LiveInterval &Int, SlotIndex UseIdx,
MachineOperand &MO, unsigned SubRegIdx) {
LaneBitmask Mask = TRI->getSubRegIndexLaneMask(SubRegIdx);
if (MO.isDef())
Mask = ~Mask;
bool IsUndef = true;
for (const LiveInterval::SubRange &S : Int.subranges()) {
if ((S.LaneMask & Mask).none())
continue;
if (S.liveAt(UseIdx)) {
IsUndef = false;
break;
}
}
if (IsUndef) {
MO.setIsUndef(true);
// We found out some subregister use is actually reading an undefined
// value. In some cases the whole vreg has become undefined at this
// point so we have to potentially shrink the main range if the
// use was ending a live segment there.
LiveQueryResult Q = Int.Query(UseIdx);
if (Q.valueOut() == nullptr)
ShrinkMainRange = true;
}
}
void RegisterCoalescer::updateRegDefsUses(unsigned SrcReg,
unsigned DstReg,
unsigned SubIdx) {
bool DstIsPhys = TargetRegisterInfo::isPhysicalRegister(DstReg);
LiveInterval *DstInt = DstIsPhys ? nullptr : &LIS->getInterval(DstReg);
if (DstInt && DstInt->hasSubRanges() && DstReg != SrcReg) {
for (MachineOperand &MO : MRI->reg_operands(DstReg)) {
unsigned SubReg = MO.getSubReg();
if (SubReg == 0 || MO.isUndef())
continue;
MachineInstr &MI = *MO.getParent();
if (MI.isDebugValue())
continue;
SlotIndex UseIdx = LIS->getInstructionIndex(MI).getRegSlot(true);
addUndefFlag(*DstInt, UseIdx, MO, SubReg);
}
}
SmallPtrSet<MachineInstr*, 8> Visited;
for (MachineRegisterInfo::reg_instr_iterator
I = MRI->reg_instr_begin(SrcReg), E = MRI->reg_instr_end();
I != E; ) {
MachineInstr *UseMI = &*(I++);
// Each instruction can only be rewritten once because sub-register
// composition is not always idempotent. When SrcReg != DstReg, rewriting
// the UseMI operands removes them from the SrcReg use-def chain, but when
// SrcReg is DstReg we could encounter UseMI twice if it has multiple
// operands mentioning the virtual register.
if (SrcReg == DstReg && !Visited.insert(UseMI).second)
continue;
SmallVector<unsigned,8> Ops;
bool Reads, Writes;
std::tie(Reads, Writes) = UseMI->readsWritesVirtualRegister(SrcReg, &Ops);
// If SrcReg wasn't read, it may still be the case that DstReg is live-in
// because SrcReg is a sub-register.
if (DstInt && !Reads && SubIdx)
Reads = DstInt->liveAt(LIS->getInstructionIndex(*UseMI));
// Replace SrcReg with DstReg in all UseMI operands.
for (unsigned i = 0, e = Ops.size(); i != e; ++i) {
MachineOperand &MO = UseMI->getOperand(Ops[i]);
// Adjust <undef> flags in case of sub-register joins. We don't want to
// turn a full def into a read-modify-write sub-register def and vice
// versa.
if (SubIdx && MO.isDef())
MO.setIsUndef(!Reads);
// A subreg use of a partially undef (super) register may be a complete
// undef use now and then has to be marked that way.
if (SubIdx != 0 && MO.isUse() && MRI->shouldTrackSubRegLiveness(DstReg)) {
if (!DstInt->hasSubRanges()) {
BumpPtrAllocator &Allocator = LIS->getVNInfoAllocator();
LaneBitmask Mask = MRI->getMaxLaneMaskForVReg(DstInt->reg);
DstInt->createSubRangeFrom(Allocator, Mask, *DstInt);
}
SlotIndex MIIdx = UseMI->isDebugValue()
? LIS->getSlotIndexes()->getIndexBefore(*UseMI)
: LIS->getInstructionIndex(*UseMI);
SlotIndex UseIdx = MIIdx.getRegSlot(true);
addUndefFlag(*DstInt, UseIdx, MO, SubIdx);
}
if (DstIsPhys)
MO.substPhysReg(DstReg, *TRI);
else
MO.substVirtReg(DstReg, SubIdx, *TRI);
}
DEBUG({
dbgs() << "\t\tupdated: ";
if (!UseMI->isDebugValue())
dbgs() << LIS->getInstructionIndex(*UseMI) << "\t";
dbgs() << *UseMI;
});
}
}
bool RegisterCoalescer::canJoinPhys(const CoalescerPair &CP) {
// Always join simple intervals that are defined by a single copy from a
// reserved register. This doesn't increase register pressure, so it is
// always beneficial.
if (!MRI->isReserved(CP.getDstReg())) {
DEBUG(dbgs() << "\tCan only merge into reserved registers.\n");
return false;
}
LiveInterval &JoinVInt = LIS->getInterval(CP.getSrcReg());
if (JoinVInt.containsOneValue())
return true;
DEBUG(dbgs() << "\tCannot join complex intervals into reserved register.\n");
return false;
}
bool RegisterCoalescer::joinCopy(MachineInstr *CopyMI, bool &Again) {
Again = false;
DEBUG(dbgs() << LIS->getInstructionIndex(*CopyMI) << '\t' << *CopyMI);
CoalescerPair CP(*TRI);
if (!CP.setRegisters(CopyMI)) {
DEBUG(dbgs() << "\tNot coalescable.\n");
return false;
}
if (CP.getNewRC()) {
auto SrcRC = MRI->getRegClass(CP.getSrcReg());
auto DstRC = MRI->getRegClass(CP.getDstReg());
unsigned SrcIdx = CP.getSrcIdx();
unsigned DstIdx = CP.getDstIdx();
if (CP.isFlipped()) {
std::swap(SrcIdx, DstIdx);
std::swap(SrcRC, DstRC);
}
if (!TRI->shouldCoalesce(CopyMI, SrcRC, SrcIdx, DstRC, DstIdx,
CP.getNewRC())) {
DEBUG(dbgs() << "\tSubtarget bailed on coalescing.\n");
return false;
}
}
// Dead code elimination. This really should be handled by MachineDCE, but
// sometimes dead copies slip through, and we can't generate invalid live
// ranges.
if (!CP.isPhys() && CopyMI->allDefsAreDead()) {
DEBUG(dbgs() << "\tCopy is dead.\n");
DeadDefs.push_back(CopyMI);
eliminateDeadDefs();
return true;
}
// Eliminate undefs.
if (!CP.isPhys() && eliminateUndefCopy(CopyMI)) {
LIS->RemoveMachineInstrFromMaps(*CopyMI);
CopyMI->eraseFromParent();
return false; // Not coalescable.
}
// Coalesced copies are normally removed immediately, but transformations
// like removeCopyByCommutingDef() can inadvertently create identity copies.
// When that happens, just join the values and remove the copy.
if (CP.getSrcReg() == CP.getDstReg()) {
LiveInterval &LI = LIS->getInterval(CP.getSrcReg());
DEBUG(dbgs() << "\tCopy already coalesced: " << LI << '\n');
const SlotIndex CopyIdx = LIS->getInstructionIndex(*CopyMI);
LiveQueryResult LRQ = LI.Query(CopyIdx);
if (VNInfo *DefVNI = LRQ.valueDefined()) {
VNInfo *ReadVNI = LRQ.valueIn();
assert(ReadVNI && "No value before copy and no <undef> flag.");
assert(ReadVNI != DefVNI && "Cannot read and define the same value.");
LI.MergeValueNumberInto(DefVNI, ReadVNI);
// Process subregister liveranges.
for (LiveInterval::SubRange &S : LI.subranges()) {
LiveQueryResult SLRQ = S.Query(CopyIdx);
if (VNInfo *SDefVNI = SLRQ.valueDefined()) {
VNInfo *SReadVNI = SLRQ.valueIn();
S.MergeValueNumberInto(SDefVNI, SReadVNI);
}
}
DEBUG(dbgs() << "\tMerged values: " << LI << '\n');
}
LIS->RemoveMachineInstrFromMaps(*CopyMI);
CopyMI->eraseFromParent();
return true;
}
// Enforce policies.
if (CP.isPhys()) {
DEBUG(dbgs() << "\tConsidering merging " << PrintReg(CP.getSrcReg(), TRI)
<< " with " << PrintReg(CP.getDstReg(), TRI, CP.getSrcIdx())
<< '\n');
if (!canJoinPhys(CP)) {
// Before giving up coalescing, if definition of source is defined by
// trivial computation, try rematerializing it.
bool IsDefCopy;
if (reMaterializeTrivialDef(CP, CopyMI, IsDefCopy))
return true;
if (IsDefCopy)
Again = true; // May be possible to coalesce later.
return false;
}
} else {
// When possible, let DstReg be the larger interval.
if (!CP.isPartial() && LIS->getInterval(CP.getSrcReg()).size() >
LIS->getInterval(CP.getDstReg()).size())
CP.flip();
DEBUG({
dbgs() << "\tConsidering merging to "
<< TRI->getRegClassName(CP.getNewRC()) << " with ";
if (CP.getDstIdx() && CP.getSrcIdx())
dbgs() << PrintReg(CP.getDstReg()) << " in "
<< TRI->getSubRegIndexName(CP.getDstIdx()) << " and "
<< PrintReg(CP.getSrcReg()) << " in "
<< TRI->getSubRegIndexName(CP.getSrcIdx()) << '\n';
else
dbgs() << PrintReg(CP.getSrcReg(), TRI) << " in "
<< PrintReg(CP.getDstReg(), TRI, CP.getSrcIdx()) << '\n';
});
}
ShrinkMask = LaneBitmask::getNone();
ShrinkMainRange = false;
// Okay, attempt to join these two intervals. On failure, this returns false.
// Otherwise, if one of the intervals being joined is a physreg, this method
// always canonicalizes DstInt to be it. The output "SrcInt" will not have
// been modified, so we can use this information below to update aliases.
if (!joinIntervals(CP)) {
// Coalescing failed.
// If definition of source is defined by trivial computation, try
// rematerializing it.
bool IsDefCopy;
if (reMaterializeTrivialDef(CP, CopyMI, IsDefCopy))
return true;
// If we can eliminate the copy without merging the live segments, do so
// now.
if (!CP.isPartial() && !CP.isPhys()) {
if (adjustCopiesBackFrom(CP, CopyMI) ||
removeCopyByCommutingDef(CP, CopyMI)) {
LIS->RemoveMachineInstrFromMaps(*CopyMI);
CopyMI->eraseFromParent();
DEBUG(dbgs() << "\tTrivial!\n");
return true;
}
}
// Otherwise, we are unable to join the intervals.
DEBUG(dbgs() << "\tInterference!\n");
Again = true; // May be possible to coalesce later.
return false;
}
// Coalescing to a virtual register that is of a sub-register class of the
// other. Make sure the resulting register is set to the right register class.
if (CP.isCrossClass()) {
++numCrossRCs;
MRI->setRegClass(CP.getDstReg(), CP.getNewRC());
}
// Removing sub-register copies can ease the register class constraints.
// Make sure we attempt to inflate the register class of DstReg.
if (!CP.isPhys() && RegClassInfo.isProperSubClass(CP.getNewRC()))
InflateRegs.push_back(CP.getDstReg());
// CopyMI has been erased by joinIntervals at this point. Remove it from
// ErasedInstrs since copyCoalesceWorkList() won't add a successful join back
// to the work list. This keeps ErasedInstrs from growing needlessly.
ErasedInstrs.erase(CopyMI);
// Rewrite all SrcReg operands to DstReg.
// Also update DstReg operands to include DstIdx if it is set.
if (CP.getDstIdx())
updateRegDefsUses(CP.getDstReg(), CP.getDstReg(), CP.getDstIdx());
updateRegDefsUses(CP.getSrcReg(), CP.getDstReg(), CP.getSrcIdx());
// Shrink subregister ranges if necessary.
if (ShrinkMask.any()) {
LiveInterval &LI = LIS->getInterval(CP.getDstReg());
for (LiveInterval::SubRange &S : LI.subranges()) {
if ((S.LaneMask & ShrinkMask).none())
continue;
DEBUG(dbgs() << "Shrink LaneUses (Lane " << PrintLaneMask(S.LaneMask)
<< ")\n");
LIS->shrinkToUses(S, LI.reg);
}
LI.removeEmptySubRanges();
}
if (ShrinkMainRange) {
LiveInterval &LI = LIS->getInterval(CP.getDstReg());
shrinkToUses(&LI);
}
// SrcReg is guaranteed to be the register whose live interval that is
// being merged.
LIS->removeInterval(CP.getSrcReg());
// Update regalloc hint.
TRI->updateRegAllocHint(CP.getSrcReg(), CP.getDstReg(), *MF);
DEBUG({
dbgs() << "\tSuccess: " << PrintReg(CP.getSrcReg(), TRI, CP.getSrcIdx())
<< " -> " << PrintReg(CP.getDstReg(), TRI, CP.getDstIdx()) << '\n';
dbgs() << "\tResult = ";
if (CP.isPhys())
dbgs() << PrintReg(CP.getDstReg(), TRI);
else
dbgs() << LIS->getInterval(CP.getDstReg());
dbgs() << '\n';
});
++numJoins;
return true;
}
bool RegisterCoalescer::joinReservedPhysReg(CoalescerPair &CP) {
unsigned DstReg = CP.getDstReg();
+ unsigned SrcReg = CP.getSrcReg();
assert(CP.isPhys() && "Must be a physreg copy");
assert(MRI->isReserved(DstReg) && "Not a reserved register");
- LiveInterval &RHS = LIS->getInterval(CP.getSrcReg());
+ LiveInterval &RHS = LIS->getInterval(SrcReg);
DEBUG(dbgs() << "\t\tRHS = " << RHS << '\n');
assert(RHS.containsOneValue() && "Invalid join with reserved register");
// Optimization for reserved registers like ESP. We can only merge with a
// reserved physreg if RHS has a single value that is a copy of DstReg.
// The live range of the reserved register will look like a set of dead defs
// - we don't properly track the live range of reserved registers.
// Deny any overlapping intervals. This depends on all the reserved
// register live ranges to look like dead defs.
if (!MRI->isConstantPhysReg(DstReg)) {
for (MCRegUnitIterator UI(DstReg, TRI); UI.isValid(); ++UI) {
// Abort if not all the regunits are reserved.
for (MCRegUnitRootIterator RI(*UI, TRI); RI.isValid(); ++RI) {
if (!MRI->isReserved(*RI))
return false;
}
if (RHS.overlaps(LIS->getRegUnit(*UI))) {
DEBUG(dbgs() << "\t\tInterference: " << PrintRegUnit(*UI, TRI) << '\n');
return false;
}
}
}
// Skip any value computations, we are not adding new values to the
// reserved register. Also skip merging the live ranges, the reserved
// register live range doesn't need to be accurate as long as all the
// defs are there.
// Delete the identity copy.
MachineInstr *CopyMI;
if (CP.isFlipped()) {
- CopyMI = MRI->getVRegDef(RHS.reg);
+ // Physreg is copied into vreg
+ // %vregY = COPY %X
+ // ... //< no other def of %X here
+ // use %vregY
+ // =>
+ // ...
+ // use %X
+ CopyMI = MRI->getVRegDef(SrcReg);
} else {
- if (!MRI->hasOneNonDBGUse(RHS.reg)) {
+ // VReg is copied into physreg:
+ // %vregX = def
+ // ... //< no other def or use of %Y here
+ // %Y = COPY %vregX
+ // =>
+ // %Y = def
+ // ...
+ if (!MRI->hasOneNonDBGUse(SrcReg)) {
DEBUG(dbgs() << "\t\tMultiple vreg uses!\n");
return false;
}
- MachineInstr *DestMI = MRI->getVRegDef(RHS.reg);
- CopyMI = &*MRI->use_instr_nodbg_begin(RHS.reg);
- const SlotIndex CopyRegIdx = LIS->getInstructionIndex(*CopyMI).getRegSlot();
- const SlotIndex DestRegIdx = LIS->getInstructionIndex(*DestMI).getRegSlot();
+ if (!LIS->intervalIsInOneMBB(RHS)) {
+ DEBUG(dbgs() << "\t\tComplex control flow!\n");
+ return false;
+ }
+ MachineInstr &DestMI = *MRI->getVRegDef(SrcReg);
+ CopyMI = &*MRI->use_instr_nodbg_begin(SrcReg);
+ SlotIndex CopyRegIdx = LIS->getInstructionIndex(*CopyMI).getRegSlot();
+ SlotIndex DestRegIdx = LIS->getInstructionIndex(DestMI).getRegSlot();
+
if (!MRI->isConstantPhysReg(DstReg)) {
// We checked above that there are no interfering defs of the physical
// register. However, for this case, where we intent to move up the def of
// the physical register, we also need to check for interfering uses.
SlotIndexes *Indexes = LIS->getSlotIndexes();
for (SlotIndex SI = Indexes->getNextNonNullIndex(DestRegIdx);
SI != CopyRegIdx; SI = Indexes->getNextNonNullIndex(SI)) {
MachineInstr *MI = LIS->getInstructionFromIndex(SI);
if (MI->readsRegister(DstReg, TRI)) {
DEBUG(dbgs() << "\t\tInterference (read): " << *MI);
return false;
}
// We must also check for clobbers caused by regmasks.
for (const auto &MO : MI->operands()) {
if (MO.isRegMask() && MO.clobbersPhysReg(DstReg)) {
DEBUG(dbgs() << "\t\tInterference (regmask clobber): " << *MI);
return false;
}
}
}
}
// We're going to remove the copy which defines a physical reserved
// register, so remove its valno, etc.
- DEBUG(dbgs() << "\t\tRemoving phys reg def of " << DstReg << " at "
- << CopyRegIdx << "\n");
+ DEBUG(dbgs() << "\t\tRemoving phys reg def of " << PrintReg(DstReg, TRI)
+ << " at " << CopyRegIdx << "\n");
LIS->removePhysRegDefAt(DstReg, CopyRegIdx);
// Create a new dead def at the new def location.
for (MCRegUnitIterator UI(DstReg, TRI); UI.isValid(); ++UI) {
LiveRange &LR = LIS->getRegUnit(*UI);
LR.createDeadDef(DestRegIdx, LIS->getVNInfoAllocator());
}
}
LIS->RemoveMachineInstrFromMaps(*CopyMI);
CopyMI->eraseFromParent();
// We don't track kills for reserved registers.
MRI->clearKillFlags(CP.getSrcReg());
return true;
}
//===----------------------------------------------------------------------===//
// Interference checking and interval joining
//===----------------------------------------------------------------------===//
//
// In the easiest case, the two live ranges being joined are disjoint, and
// there is no interference to consider. It is quite common, though, to have
// overlapping live ranges, and we need to check if the interference can be
// resolved.
//
// The live range of a single SSA value forms a sub-tree of the dominator tree.
// This means that two SSA values overlap if and only if the def of one value
// is contained in the live range of the other value. As a special case, the
// overlapping values can be defined at the same index.
//
// The interference from an overlapping def can be resolved in these cases:
//
// 1. Coalescable copies. The value is defined by a copy that would become an
// identity copy after joining SrcReg and DstReg. The copy instruction will
// be removed, and the value will be merged with the source value.
//
// There can be several copies back and forth, causing many values to be
// merged into one. We compute a list of ultimate values in the joined live
// range as well as a mappings from the old value numbers.
//
// 2. IMPLICIT_DEF. This instruction is only inserted to ensure all PHI
// predecessors have a live out value. It doesn't cause real interference,
// and can be merged into the value it overlaps. Like a coalescable copy, it
// can be erased after joining.
//
// 3. Copy of external value. The overlapping def may be a copy of a value that
// is already in the other register. This is like a coalescable copy, but
// the live range of the source register must be trimmed after erasing the
// copy instruction:
//
// %src = COPY %ext
// %dst = COPY %ext <-- Remove this COPY, trim the live range of %ext.
//
// 4. Clobbering undefined lanes. Vector registers are sometimes built by
// defining one lane at a time:
//
// %dst:ssub0<def,read-undef> = FOO
// %src = BAR
// %dst:ssub1<def> = COPY %src
//
// The live range of %src overlaps the %dst value defined by FOO, but
// merging %src into %dst:ssub1 is only going to clobber the ssub1 lane
// which was undef anyway.
//
// The value mapping is more complicated in this case. The final live range
// will have different value numbers for both FOO and BAR, but there is no
// simple mapping from old to new values. It may even be necessary to add
// new PHI values.
//
// 5. Clobbering dead lanes. A def may clobber a lane of a vector register that
// is live, but never read. This can happen because we don't compute
// individual live ranges per lane.
//
// %dst<def> = FOO
// %src = BAR
// %dst:ssub1<def> = COPY %src
//
// This kind of interference is only resolved locally. If the clobbered
// lane value escapes the block, the join is aborted.
namespace {
/// Track information about values in a single virtual register about to be
/// joined. Objects of this class are always created in pairs - one for each
/// side of the CoalescerPair (or one for each lane of a side of the coalescer
/// pair)
class JoinVals {
/// Live range we work on.
LiveRange &LR;
/// (Main) register we work on.
const unsigned Reg;
/// Reg (and therefore the values in this liverange) will end up as
/// subregister SubIdx in the coalesced register. Either CP.DstIdx or
/// CP.SrcIdx.
const unsigned SubIdx;
/// The LaneMask that this liverange will occupy the coalesced register. May
/// be smaller than the lanemask produced by SubIdx when merging subranges.
const LaneBitmask LaneMask;
/// This is true when joining sub register ranges, false when joining main
/// ranges.
const bool SubRangeJoin;
/// Whether the current LiveInterval tracks subregister liveness.
const bool TrackSubRegLiveness;
/// Values that will be present in the final live range.
SmallVectorImpl<VNInfo*> &NewVNInfo;
const CoalescerPair &CP;
LiveIntervals *LIS;
SlotIndexes *Indexes;
const TargetRegisterInfo *TRI;
/// Value number assignments. Maps value numbers in LI to entries in
/// NewVNInfo. This is suitable for passing to LiveInterval::join().
SmallVector<int, 8> Assignments;
/// Conflict resolution for overlapping values.
enum ConflictResolution {
/// No overlap, simply keep this value.
CR_Keep,
/// Merge this value into OtherVNI and erase the defining instruction.
/// Used for IMPLICIT_DEF, coalescable copies, and copies from external
/// values.
CR_Erase,
/// Merge this value into OtherVNI but keep the defining instruction.
/// This is for the special case where OtherVNI is defined by the same
/// instruction.
CR_Merge,
/// Keep this value, and have it replace OtherVNI where possible. This
/// complicates value mapping since OtherVNI maps to two different values
/// before and after this def.
/// Used when clobbering undefined or dead lanes.
CR_Replace,
/// Unresolved conflict. Visit later when all values have been mapped.
CR_Unresolved,
/// Unresolvable conflict. Abort the join.
CR_Impossible
};
/// Per-value info for LI. The lane bit masks are all relative to the final
/// joined register, so they can be compared directly between SrcReg and
/// DstReg.
struct Val {
ConflictResolution Resolution;
/// Lanes written by this def, 0 for unanalyzed values.
LaneBitmask WriteLanes;
/// Lanes with defined values in this register. Other lanes are undef and
/// safe to clobber.
LaneBitmask ValidLanes;
/// Value in LI being redefined by this def.
VNInfo *RedefVNI;
/// Value in the other live range that overlaps this def, if any.
VNInfo *OtherVNI;
/// Is this value an IMPLICIT_DEF that can be erased?
///
/// IMPLICIT_DEF values should only exist at the end of a basic block that
/// is a predecessor to a phi-value. These IMPLICIT_DEF instructions can be
/// safely erased if they are overlapping a live value in the other live
/// interval.
///
/// Weird control flow graphs and incomplete PHI handling in
/// ProcessImplicitDefs can very rarely create IMPLICIT_DEF values with
/// longer live ranges. Such IMPLICIT_DEF values should be treated like
/// normal values.
bool ErasableImplicitDef;
/// True when the live range of this value will be pruned because of an
/// overlapping CR_Replace value in the other live range.
bool Pruned;
/// True once Pruned above has been computed.
bool PrunedComputed;
Val() : Resolution(CR_Keep), WriteLanes(), ValidLanes(),
RedefVNI(nullptr), OtherVNI(nullptr), ErasableImplicitDef(false),
Pruned(false), PrunedComputed(false) {}
bool isAnalyzed() const { return WriteLanes.any(); }
};
/// One entry per value number in LI.
SmallVector<Val, 8> Vals;
/// Compute the bitmask of lanes actually written by DefMI.
/// Set Redef if there are any partial register definitions that depend on the
/// previous value of the register.
LaneBitmask computeWriteLanes(const MachineInstr *DefMI, bool &Redef) const;
/// Find the ultimate value that VNI was copied from.
std::pair<const VNInfo*,unsigned> followCopyChain(const VNInfo *VNI) const;
bool valuesIdentical(VNInfo *Val0, VNInfo *Val1, const JoinVals &Other) const;
/// Analyze ValNo in this live range, and set all fields of Vals[ValNo].
/// Return a conflict resolution when possible, but leave the hard cases as
/// CR_Unresolved.
/// Recursively calls computeAssignment() on this and Other, guaranteeing that
/// both OtherVNI and RedefVNI have been analyzed and mapped before returning.
/// The recursion always goes upwards in the dominator tree, making loops
/// impossible.
ConflictResolution analyzeValue(unsigned ValNo, JoinVals &Other);
/// Compute the value assignment for ValNo in RI.
/// This may be called recursively by analyzeValue(), but never for a ValNo on
/// the stack.
void computeAssignment(unsigned ValNo, JoinVals &Other);
/// Assuming ValNo is going to clobber some valid lanes in Other.LR, compute
/// the extent of the tainted lanes in the block.
///
/// Multiple values in Other.LR can be affected since partial redefinitions
/// can preserve previously tainted lanes.
///
/// 1 %dst = VLOAD <-- Define all lanes in %dst
/// 2 %src = FOO <-- ValNo to be joined with %dst:ssub0
/// 3 %dst:ssub1 = BAR <-- Partial redef doesn't clear taint in ssub0
/// 4 %dst:ssub0 = COPY %src <-- Conflict resolved, ssub0 wasn't read
///
/// For each ValNo in Other that is affected, add an (EndIndex, TaintedLanes)
/// entry to TaintedVals.
///
/// Returns false if the tainted lanes extend beyond the basic block.
bool taintExtent(unsigned, LaneBitmask, JoinVals&,
SmallVectorImpl<std::pair<SlotIndex, LaneBitmask> >&);
/// Return true if MI uses any of the given Lanes from Reg.
/// This does not include partial redefinitions of Reg.
bool usesLanes(const MachineInstr &MI, unsigned, unsigned, LaneBitmask) const;
/// Determine if ValNo is a copy of a value number in LR or Other.LR that will
/// be pruned:
///
/// %dst = COPY %src
/// %src = COPY %dst <-- This value to be pruned.
/// %dst = COPY %src <-- This value is a copy of a pruned value.
bool isPrunedValue(unsigned ValNo, JoinVals &Other);
public:
JoinVals(LiveRange &LR, unsigned Reg, unsigned SubIdx, LaneBitmask LaneMask,
SmallVectorImpl<VNInfo*> &newVNInfo, const CoalescerPair &cp,
LiveIntervals *lis, const TargetRegisterInfo *TRI, bool SubRangeJoin,
bool TrackSubRegLiveness)
: LR(LR), Reg(Reg), SubIdx(SubIdx), LaneMask(LaneMask),
SubRangeJoin(SubRangeJoin), TrackSubRegLiveness(TrackSubRegLiveness),
NewVNInfo(newVNInfo), CP(cp), LIS(lis), Indexes(LIS->getSlotIndexes()),
TRI(TRI), Assignments(LR.getNumValNums(), -1), Vals(LR.getNumValNums())
{}
/// Analyze defs in LR and compute a value mapping in NewVNInfo.
/// Returns false if any conflicts were impossible to resolve.
bool mapValues(JoinVals &Other);
/// Try to resolve conflicts that require all values to be mapped.
/// Returns false if any conflicts were impossible to resolve.
bool resolveConflicts(JoinVals &Other);
/// Prune the live range of values in Other.LR where they would conflict with
/// CR_Replace values in LR. Collect end points for restoring the live range
/// after joining.
void pruneValues(JoinVals &Other, SmallVectorImpl<SlotIndex> &EndPoints,
bool changeInstrs);
/// Removes subranges starting at copies that get removed. This sometimes
/// happens when undefined subranges are copied around. These ranges contain
/// no useful information and can be removed.
void pruneSubRegValues(LiveInterval &LI, LaneBitmask &ShrinkMask);
/// Pruning values in subranges can lead to removing segments in these
/// subranges started by IMPLICIT_DEFs. The corresponding segments in
/// the main range also need to be removed. This function will mark
/// the corresponding values in the main range as pruned, so that
/// eraseInstrs can do the final cleanup.
/// The parameter @p LI must be the interval whose main range is the
/// live range LR.
void pruneMainSegments(LiveInterval &LI, bool &ShrinkMainRange);
/// Erase any machine instructions that have been coalesced away.
/// Add erased instructions to ErasedInstrs.
/// Add foreign virtual registers to ShrinkRegs if their live range ended at
/// the erased instrs.
void eraseInstrs(SmallPtrSetImpl<MachineInstr*> &ErasedInstrs,
SmallVectorImpl<unsigned> &ShrinkRegs,
LiveInterval *LI = nullptr);
/// Remove liverange defs at places where implicit defs will be removed.
void removeImplicitDefs();
/// Get the value assignments suitable for passing to LiveInterval::join.
const int *getAssignments() const { return Assignments.data(); }
};
} // end anonymous namespace
LaneBitmask JoinVals::computeWriteLanes(const MachineInstr *DefMI, bool &Redef)
const {
LaneBitmask L;
for (const MachineOperand &MO : DefMI->operands()) {
if (!MO.isReg() || MO.getReg() != Reg || !MO.isDef())
continue;
L |= TRI->getSubRegIndexLaneMask(
TRI->composeSubRegIndices(SubIdx, MO.getSubReg()));
if (MO.readsReg())
Redef = true;
}
return L;
}
std::pair<const VNInfo*, unsigned> JoinVals::followCopyChain(
const VNInfo *VNI) const {
unsigned Reg = this->Reg;
while (!VNI->isPHIDef()) {
SlotIndex Def = VNI->def;
MachineInstr *MI = Indexes->getInstructionFromIndex(Def);
assert(MI && "No defining instruction");
if (!MI->isFullCopy())
return std::make_pair(VNI, Reg);
unsigned SrcReg = MI->getOperand(1).getReg();
if (!TargetRegisterInfo::isVirtualRegister(SrcReg))
return std::make_pair(VNI, Reg);
const LiveInterval &LI = LIS->getInterval(SrcReg);
const VNInfo *ValueIn;
// No subrange involved.
if (!SubRangeJoin || !LI.hasSubRanges()) {
LiveQueryResult LRQ = LI.Query(Def);
ValueIn = LRQ.valueIn();
} else {
// Query subranges. Pick the first matching one.
ValueIn = nullptr;
for (const LiveInterval::SubRange &S : LI.subranges()) {
// Transform lanemask to a mask in the joined live interval.
LaneBitmask SMask = TRI->composeSubRegIndexLaneMask(SubIdx, S.LaneMask);
if ((SMask & LaneMask).none())
continue;
LiveQueryResult LRQ = S.Query(Def);
ValueIn = LRQ.valueIn();
break;
}
}
if (ValueIn == nullptr)
break;
VNI = ValueIn;
Reg = SrcReg;
}
return std::make_pair(VNI, Reg);
}
bool JoinVals::valuesIdentical(VNInfo *Value0, VNInfo *Value1,
const JoinVals &Other) const {
const VNInfo *Orig0;
unsigned Reg0;
std::tie(Orig0, Reg0) = followCopyChain(Value0);
if (Orig0 == Value1)
return true;
const VNInfo *Orig1;
unsigned Reg1;
std::tie(Orig1, Reg1) = Other.followCopyChain(Value1);
// The values are equal if they are defined at the same place and use the
// same register. Note that we cannot compare VNInfos directly as some of
// them might be from a copy created in mergeSubRangeInto() while the other
// is from the original LiveInterval.
return Orig0->def == Orig1->def && Reg0 == Reg1;
}
JoinVals::ConflictResolution
JoinVals::analyzeValue(unsigned ValNo, JoinVals &Other) {
Val &V = Vals[ValNo];
assert(!V.isAnalyzed() && "Value has already been analyzed!");
VNInfo *VNI = LR.getValNumInfo(ValNo);
if (VNI->isUnused()) {
V.WriteLanes = LaneBitmask::getAll();
return CR_Keep;
}
// Get the instruction defining this value, compute the lanes written.
const MachineInstr *DefMI = nullptr;
if (VNI->isPHIDef()) {
// Conservatively assume that all lanes in a PHI are valid.
LaneBitmask Lanes = SubRangeJoin ? LaneBitmask(1)
: TRI->getSubRegIndexLaneMask(SubIdx);
V.ValidLanes = V.WriteLanes = Lanes;
} else {
DefMI = Indexes->getInstructionFromIndex(VNI->def);
assert(DefMI != nullptr);
if (SubRangeJoin) {
// We don't care about the lanes when joining subregister ranges.
V.WriteLanes = V.ValidLanes = LaneBitmask(1);
if (DefMI->isImplicitDef()) {
V.ValidLanes = LaneBitmask::getNone();
V.ErasableImplicitDef = true;
}
} else {
bool Redef = false;
V.ValidLanes = V.WriteLanes = computeWriteLanes(DefMI, Redef);
// If this is a read-modify-write instruction, there may be more valid
// lanes than the ones written by this instruction.
// This only covers partial redef operands. DefMI may have normal use
// operands reading the register. They don't contribute valid lanes.
//
// This adds ssub1 to the set of valid lanes in %src:
//
// %src:ssub1<def> = FOO
//
// This leaves only ssub1 valid, making any other lanes undef:
//
// %src:ssub1<def,read-undef> = FOO %src:ssub2
//
// The <read-undef> flag on the def operand means that old lane values are
// not important.
if (Redef) {
V.RedefVNI = LR.Query(VNI->def).valueIn();
assert((TrackSubRegLiveness || V.RedefVNI) &&
"Instruction is reading nonexistent value");
if (V.RedefVNI != nullptr) {
computeAssignment(V.RedefVNI->id, Other);
V.ValidLanes |= Vals[V.RedefVNI->id].ValidLanes;
}
}
// An IMPLICIT_DEF writes undef values.
if (DefMI->isImplicitDef()) {
// We normally expect IMPLICIT_DEF values to be live only until the end
// of their block. If the value is really live longer and gets pruned in
// another block, this flag is cleared again.
V.ErasableImplicitDef = true;
V.ValidLanes &= ~V.WriteLanes;
}
}
}
// Find the value in Other that overlaps VNI->def, if any.
LiveQueryResult OtherLRQ = Other.LR.Query(VNI->def);
// It is possible that both values are defined by the same instruction, or
// the values are PHIs defined in the same block. When that happens, the two
// values should be merged into one, but not into any preceding value.
// The first value defined or visited gets CR_Keep, the other gets CR_Merge.
if (VNInfo *OtherVNI = OtherLRQ.valueDefined()) {
assert(SlotIndex::isSameInstr(VNI->def, OtherVNI->def) && "Broken LRQ");
// One value stays, the other is merged. Keep the earlier one, or the first
// one we see.
if (OtherVNI->def < VNI->def)
Other.computeAssignment(OtherVNI->id, *this);
else if (VNI->def < OtherVNI->def && OtherLRQ.valueIn()) {
// This is an early-clobber def overlapping a live-in value in the other
// register. Not mergeable.
V.OtherVNI = OtherLRQ.valueIn();
return CR_Impossible;
}
V.OtherVNI = OtherVNI;
Val &OtherV = Other.Vals[OtherVNI->id];
// Keep this value, check for conflicts when analyzing OtherVNI.
if (!OtherV.isAnalyzed())
return CR_Keep;
// Both sides have been analyzed now.
// Allow overlapping PHI values. Any real interference would show up in a
// predecessor, the PHI itself can't introduce any conflicts.
if (VNI->isPHIDef())
return CR_Merge;
if ((V.ValidLanes & OtherV.ValidLanes).any())
// Overlapping lanes can't be resolved.
return CR_Impossible;
else
return CR_Merge;
}
// No simultaneous def. Is Other live at the def?
V.OtherVNI = OtherLRQ.valueIn();
if (!V.OtherVNI)
// No overlap, no conflict.
return CR_Keep;
assert(!SlotIndex::isSameInstr(VNI->def, V.OtherVNI->def) && "Broken LRQ");
// We have overlapping values, or possibly a kill of Other.
// Recursively compute assignments up the dominator tree.
Other.computeAssignment(V.OtherVNI->id, *this);
Val &OtherV = Other.Vals[V.OtherVNI->id];
// Check if OtherV is an IMPLICIT_DEF that extends beyond its basic block.
// This shouldn't normally happen, but ProcessImplicitDefs can leave such
// IMPLICIT_DEF instructions behind, and there is nothing wrong with it
// technically.
//
// When it happens, treat that IMPLICIT_DEF as a normal value, and don't try
// to erase the IMPLICIT_DEF instruction.
if (OtherV.ErasableImplicitDef && DefMI &&
DefMI->getParent() != Indexes->getMBBFromIndex(V.OtherVNI->def)) {
DEBUG(dbgs() << "IMPLICIT_DEF defined at " << V.OtherVNI->def
<< " extends into BB#" << DefMI->getParent()->getNumber()
<< ", keeping it.\n");
OtherV.ErasableImplicitDef = false;
}
// Allow overlapping PHI values. Any real interference would show up in a
// predecessor, the PHI itself can't introduce any conflicts.
if (VNI->isPHIDef())
return CR_Replace;
// Check for simple erasable conflicts.
if (DefMI->isImplicitDef()) {
// We need the def for the subregister if there is nothing else live at the
// subrange at this point.
if (TrackSubRegLiveness
&& (V.WriteLanes & (OtherV.ValidLanes | OtherV.WriteLanes)).none())
return CR_Replace;
return CR_Erase;
}
// Include the non-conflict where DefMI is a coalescable copy that kills
// OtherVNI. We still want the copy erased and value numbers merged.
if (CP.isCoalescable(DefMI)) {
// Some of the lanes copied from OtherVNI may be undef, making them undef
// here too.
V.ValidLanes &= ~V.WriteLanes | OtherV.ValidLanes;
return CR_Erase;
}
// This may not be a real conflict if DefMI simply kills Other and defines
// VNI.
if (OtherLRQ.isKill() && OtherLRQ.endPoint() <= VNI->def)
return CR_Keep;
// Handle the case where VNI and OtherVNI can be proven to be identical:
//
// %other = COPY %ext
// %this = COPY %ext <-- Erase this copy
//
if (DefMI->isFullCopy() && !CP.isPartial()
&& valuesIdentical(VNI, V.OtherVNI, Other))
return CR_Erase;
// If the lanes written by this instruction were all undef in OtherVNI, it is
// still safe to join the live ranges. This can't be done with a simple value
// mapping, though - OtherVNI will map to multiple values:
//
// 1 %dst:ssub0 = FOO <-- OtherVNI
// 2 %src = BAR <-- VNI
// 3 %dst:ssub1 = COPY %src<kill> <-- Eliminate this copy.
// 4 BAZ %dst<kill>
// 5 QUUX %src<kill>
//
// Here OtherVNI will map to itself in [1;2), but to VNI in [2;5). CR_Replace
// handles this complex value mapping.
if ((V.WriteLanes & OtherV.ValidLanes).none())
return CR_Replace;
// If the other live range is killed by DefMI and the live ranges are still
// overlapping, it must be because we're looking at an early clobber def:
//
// %dst<def,early-clobber> = ASM %src<kill>
//
// In this case, it is illegal to merge the two live ranges since the early
// clobber def would clobber %src before it was read.
if (OtherLRQ.isKill()) {
// This case where the def doesn't overlap the kill is handled above.
assert(VNI->def.isEarlyClobber() &&
"Only early clobber defs can overlap a kill");
return CR_Impossible;
}
// VNI is clobbering live lanes in OtherVNI, but there is still the
// possibility that no instructions actually read the clobbered lanes.
// If we're clobbering all the lanes in OtherVNI, at least one must be read.
// Otherwise Other.RI wouldn't be live here.
if ((TRI->getSubRegIndexLaneMask(Other.SubIdx) & ~V.WriteLanes).none())
return CR_Impossible;
// We need to verify that no instructions are reading the clobbered lanes. To
// save compile time, we'll only check that locally. Don't allow the tainted
// value to escape the basic block.
MachineBasicBlock *MBB = Indexes->getMBBFromIndex(VNI->def);
if (OtherLRQ.endPoint() >= Indexes->getMBBEndIdx(MBB))
return CR_Impossible;
// There are still some things that could go wrong besides clobbered lanes
// being read, for example OtherVNI may be only partially redefined in MBB,
// and some clobbered lanes could escape the block. Save this analysis for
// resolveConflicts() when all values have been mapped. We need to know
// RedefVNI and WriteLanes for any later defs in MBB, and we can't compute
// that now - the recursive analyzeValue() calls must go upwards in the
// dominator tree.
return CR_Unresolved;
}
void JoinVals::computeAssignment(unsigned ValNo, JoinVals &Other) {
Val &V = Vals[ValNo];
if (V.isAnalyzed()) {
// Recursion should always move up the dominator tree, so ValNo is not
// supposed to reappear before it has been assigned.
assert(Assignments[ValNo] != -1 && "Bad recursion?");
return;
}
switch ((V.Resolution = analyzeValue(ValNo, Other))) {
case CR_Erase:
case CR_Merge:
// Merge this ValNo into OtherVNI.
assert(V.OtherVNI && "OtherVNI not assigned, can't merge.");
assert(Other.Vals[V.OtherVNI->id].isAnalyzed() && "Missing recursion");
Assignments[ValNo] = Other.Assignments[V.OtherVNI->id];
DEBUG(dbgs() << "\t\tmerge " << PrintReg(Reg) << ':' << ValNo << '@'
<< LR.getValNumInfo(ValNo)->def << " into "
<< PrintReg(Other.Reg) << ':' << V.OtherVNI->id << '@'
<< V.OtherVNI->def << " --> @"
<< NewVNInfo[Assignments[ValNo]]->def << '\n');
break;
case CR_Replace:
case CR_Unresolved: {
// The other value is going to be pruned if this join is successful.
assert(V.OtherVNI && "OtherVNI not assigned, can't prune");
Val &OtherV = Other.Vals[V.OtherVNI->id];
// We cannot erase an IMPLICIT_DEF if we don't have valid values for all
// its lanes.
if ((OtherV.WriteLanes & ~V.ValidLanes).any() && TrackSubRegLiveness)
OtherV.ErasableImplicitDef = false;
OtherV.Pruned = true;
LLVM_FALLTHROUGH;
}
default:
// This value number needs to go in the final joined live range.
Assignments[ValNo] = NewVNInfo.size();
NewVNInfo.push_back(LR.getValNumInfo(ValNo));
break;
}
}
bool JoinVals::mapValues(JoinVals &Other) {
for (unsigned i = 0, e = LR.getNumValNums(); i != e; ++i) {
computeAssignment(i, Other);
if (Vals[i].Resolution == CR_Impossible) {
DEBUG(dbgs() << "\t\tinterference at " << PrintReg(Reg) << ':' << i
<< '@' << LR.getValNumInfo(i)->def << '\n');
return false;
}
}
return true;
}
bool JoinVals::
taintExtent(unsigned ValNo, LaneBitmask TaintedLanes, JoinVals &Other,
SmallVectorImpl<std::pair<SlotIndex, LaneBitmask> > &TaintExtent) {
VNInfo *VNI = LR.getValNumInfo(ValNo);
MachineBasicBlock *MBB = Indexes->getMBBFromIndex(VNI->def);
SlotIndex MBBEnd = Indexes->getMBBEndIdx(MBB);
// Scan Other.LR from VNI.def to MBBEnd.
LiveInterval::iterator OtherI = Other.LR.find(VNI->def);
assert(OtherI != Other.LR.end() && "No conflict?");
do {
// OtherI is pointing to a tainted value. Abort the join if the tainted
// lanes escape the block.
SlotIndex End = OtherI->end;
if (End >= MBBEnd) {
DEBUG(dbgs() << "\t\ttaints global " << PrintReg(Other.Reg) << ':'
<< OtherI->valno->id << '@' << OtherI->start << '\n');
return false;
}
DEBUG(dbgs() << "\t\ttaints local " << PrintReg(Other.Reg) << ':'
<< OtherI->valno->id << '@' << OtherI->start
<< " to " << End << '\n');
// A dead def is not a problem.
if (End.isDead())
break;
TaintExtent.push_back(std::make_pair(End, TaintedLanes));
// Check for another def in the MBB.
if (++OtherI == Other.LR.end() || OtherI->start >= MBBEnd)
break;
// Lanes written by the new def are no longer tainted.
const Val &OV = Other.Vals[OtherI->valno->id];
TaintedLanes &= ~OV.WriteLanes;
if (!OV.RedefVNI)
break;
} while (TaintedLanes.any());
return true;
}
bool JoinVals::usesLanes(const MachineInstr &MI, unsigned Reg, unsigned SubIdx,
LaneBitmask Lanes) const {
if (MI.isDebugValue())
return false;
for (const MachineOperand &MO : MI.operands()) {
if (!MO.isReg() || MO.isDef() || MO.getReg() != Reg)
continue;
if (!MO.readsReg())
continue;
unsigned S = TRI->composeSubRegIndices(SubIdx, MO.getSubReg());
if ((Lanes & TRI->getSubRegIndexLaneMask(S)).any())
return true;
}
return false;
}
bool JoinVals::resolveConflicts(JoinVals &Other) {
for (unsigned i = 0, e = LR.getNumValNums(); i != e; ++i) {
Val &V = Vals[i];
assert (V.Resolution != CR_Impossible && "Unresolvable conflict");
if (V.Resolution != CR_Unresolved)
continue;
DEBUG(dbgs() << "\t\tconflict at " << PrintReg(Reg) << ':' << i
<< '@' << LR.getValNumInfo(i)->def << '\n');
if (SubRangeJoin)
return false;
++NumLaneConflicts;
assert(V.OtherVNI && "Inconsistent conflict resolution.");
VNInfo *VNI = LR.getValNumInfo(i);
const Val &OtherV = Other.Vals[V.OtherVNI->id];
// VNI is known to clobber some lanes in OtherVNI. If we go ahead with the
// join, those lanes will be tainted with a wrong value. Get the extent of
// the tainted lanes.
LaneBitmask TaintedLanes = V.WriteLanes & OtherV.ValidLanes;
SmallVector<std::pair<SlotIndex, LaneBitmask>, 8> TaintExtent;
if (!taintExtent(i, TaintedLanes, Other, TaintExtent))
// Tainted lanes would extend beyond the basic block.
return false;
assert(!TaintExtent.empty() && "There should be at least one conflict.");
// Now look at the instructions from VNI->def to TaintExtent (inclusive).
MachineBasicBlock *MBB = Indexes->getMBBFromIndex(VNI->def);
MachineBasicBlock::iterator MI = MBB->begin();
if (!VNI->isPHIDef()) {
MI = Indexes->getInstructionFromIndex(VNI->def);
// No need to check the instruction defining VNI for reads.
++MI;
}
assert(!SlotIndex::isSameInstr(VNI->def, TaintExtent.front().first) &&
"Interference ends on VNI->def. Should have been handled earlier");
MachineInstr *LastMI =
Indexes->getInstructionFromIndex(TaintExtent.front().first);
assert(LastMI && "Range must end at a proper instruction");
unsigned TaintNum = 0;
for (;;) {
assert(MI != MBB->end() && "Bad LastMI");
if (usesLanes(*MI, Other.Reg, Other.SubIdx, TaintedLanes)) {
DEBUG(dbgs() << "\t\ttainted lanes used by: " << *MI);
return false;
}
// LastMI is the last instruction to use the current value.
if (&*MI == LastMI) {
if (++TaintNum == TaintExtent.size())
break;
LastMI = Indexes->getInstructionFromIndex(TaintExtent[TaintNum].first);
assert(LastMI && "Range must end at a proper instruction");
TaintedLanes = TaintExtent[TaintNum].second;
}
++MI;
}
// The tainted lanes are unused.
V.Resolution = CR_Replace;
++NumLaneResolves;
}
return true;
}
bool JoinVals::isPrunedValue(unsigned ValNo, JoinVals &Other) {
Val &V = Vals[ValNo];
if (V.Pruned || V.PrunedComputed)
return V.Pruned;
if (V.Resolution != CR_Erase && V.Resolution != CR_Merge)
return V.Pruned;
// Follow copies up the dominator tree and check if any intermediate value
// has been pruned.
V.PrunedComputed = true;
V.Pruned = Other.isPrunedValue(V.OtherVNI->id, *this);
return V.Pruned;
}
void JoinVals::pruneValues(JoinVals &Other,
SmallVectorImpl<SlotIndex> &EndPoints,
bool changeInstrs) {
for (unsigned i = 0, e = LR.getNumValNums(); i != e; ++i) {
SlotIndex Def = LR.getValNumInfo(i)->def;
switch (Vals[i].Resolution) {
case CR_Keep:
break;
case CR_Replace: {
// This value takes precedence over the value in Other.LR.
LIS->pruneValue(Other.LR, Def, &EndPoints);
// Check if we're replacing an IMPLICIT_DEF value. The IMPLICIT_DEF
// instructions are only inserted to provide a live-out value for PHI
// predecessors, so the instruction should simply go away once its value
// has been replaced.
Val &OtherV = Other.Vals[Vals[i].OtherVNI->id];
bool EraseImpDef = OtherV.ErasableImplicitDef &&
OtherV.Resolution == CR_Keep;
if (!Def.isBlock()) {
if (changeInstrs) {
// Remove <def,read-undef> flags. This def is now a partial redef.
// Also remove <def,dead> flags since the joined live range will
// continue past this instruction.
for (MachineOperand &MO :
Indexes->getInstructionFromIndex(Def)->operands()) {
if (MO.isReg() && MO.isDef() && MO.getReg() == Reg) {
if (MO.getSubReg() != 0)
MO.setIsUndef(EraseImpDef);
MO.setIsDead(false);
}
}
}
// This value will reach instructions below, but we need to make sure
// the live range also reaches the instruction at Def.
if (!EraseImpDef)
EndPoints.push_back(Def);
}
DEBUG(dbgs() << "\t\tpruned " << PrintReg(Other.Reg) << " at " << Def
<< ": " << Other.LR << '\n');
break;
}
case CR_Erase:
case CR_Merge:
if (isPrunedValue(i, Other)) {
// This value is ultimately a copy of a pruned value in LR or Other.LR.
// We can no longer trust the value mapping computed by
// computeAssignment(), the value that was originally copied could have
// been replaced.
LIS->pruneValue(LR, Def, &EndPoints);
DEBUG(dbgs() << "\t\tpruned all of " << PrintReg(Reg) << " at "
<< Def << ": " << LR << '\n');
}
break;
case CR_Unresolved:
case CR_Impossible:
llvm_unreachable("Unresolved conflicts");
}
}
}
void JoinVals::pruneSubRegValues(LiveInterval &LI, LaneBitmask &ShrinkMask) {
// Look for values being erased.
bool DidPrune = false;
for (unsigned i = 0, e = LR.getNumValNums(); i != e; ++i) {
if (Vals[i].Resolution != CR_Erase)
continue;
// Check subranges at the point where the copy will be removed.
SlotIndex Def = LR.getValNumInfo(i)->def;
for (LiveInterval::SubRange &S : LI.subranges()) {
LiveQueryResult Q = S.Query(Def);
// If a subrange starts at the copy then an undefined value has been
// copied and we must remove that subrange value as well.
VNInfo *ValueOut = Q.valueOutOrDead();
if (ValueOut != nullptr && Q.valueIn() == nullptr) {
DEBUG(dbgs() << "\t\tPrune sublane " << PrintLaneMask(S.LaneMask)
<< " at " << Def << "\n");
LIS->pruneValue(S, Def, nullptr);
DidPrune = true;
// Mark value number as unused.
ValueOut->markUnused();
continue;
}
// If a subrange ends at the copy, then a value was copied but only
// partially used later. Shrink the subregister range appropriately.
if (Q.valueIn() != nullptr && Q.valueOut() == nullptr) {
DEBUG(dbgs() << "\t\tDead uses at sublane " << PrintLaneMask(S.LaneMask)
<< " at " << Def << "\n");
ShrinkMask |= S.LaneMask;
}
}
}
if (DidPrune)
LI.removeEmptySubRanges();
}
/// Check if any of the subranges of @p LI contain a definition at @p Def.
static bool isDefInSubRange(LiveInterval &LI, SlotIndex Def) {
for (LiveInterval::SubRange &SR : LI.subranges()) {
if (VNInfo *VNI = SR.Query(Def).valueOutOrDead())
if (VNI->def == Def)
return true;
}
return false;
}
void JoinVals::pruneMainSegments(LiveInterval &LI, bool &ShrinkMainRange) {
assert(&static_cast<LiveRange&>(LI) == &LR);
for (unsigned i = 0, e = LR.getNumValNums(); i != e; ++i) {
if (Vals[i].Resolution != CR_Keep)
continue;
VNInfo *VNI = LR.getValNumInfo(i);
if (VNI->isUnused() || VNI->isPHIDef() || isDefInSubRange(LI, VNI->def))
continue;
Vals[i].Pruned = true;
ShrinkMainRange = true;
}
}
void JoinVals::removeImplicitDefs() {
for (unsigned i = 0, e = LR.getNumValNums(); i != e; ++i) {
Val &V = Vals[i];
if (V.Resolution != CR_Keep || !V.ErasableImplicitDef || !V.Pruned)
continue;
VNInfo *VNI = LR.getValNumInfo(i);
VNI->markUnused();
LR.removeValNo(VNI);
}
}
void JoinVals::eraseInstrs(SmallPtrSetImpl<MachineInstr*> &ErasedInstrs,
SmallVectorImpl<unsigned> &ShrinkRegs,
LiveInterval *LI) {
for (unsigned i = 0, e = LR.getNumValNums(); i != e; ++i) {
// Get the def location before markUnused() below invalidates it.
SlotIndex Def = LR.getValNumInfo(i)->def;
switch (Vals[i].Resolution) {
case CR_Keep: {
// If an IMPLICIT_DEF value is pruned, it doesn't serve a purpose any
// longer. The IMPLICIT_DEF instructions are only inserted by
// PHIElimination to guarantee that all PHI predecessors have a value.
if (!Vals[i].ErasableImplicitDef || !Vals[i].Pruned)
break;
// Remove value number i from LR.
// For intervals with subranges, removing a segment from the main range
// may require extending the previous segment: for each definition of
// a subregister, there will be a corresponding def in the main range.
// That def may fall in the middle of a segment from another subrange.
// In such cases, removing this def from the main range must be
// complemented by extending the main range to account for the liveness
// of the other subrange.
VNInfo *VNI = LR.getValNumInfo(i);
SlotIndex Def = VNI->def;
// The new end point of the main range segment to be extended.
SlotIndex NewEnd;
if (LI != nullptr) {
LiveRange::iterator I = LR.FindSegmentContaining(Def);
assert(I != LR.end());
// Do not extend beyond the end of the segment being removed.
// The segment may have been pruned in preparation for joining
// live ranges.
NewEnd = I->end;
}
LR.removeValNo(VNI);
// Note that this VNInfo is reused and still referenced in NewVNInfo,
// make it appear like an unused value number.
VNI->markUnused();
if (LI != nullptr && LI->hasSubRanges()) {
assert(static_cast<LiveRange*>(LI) == &LR);
// Determine the end point based on the subrange information:
// minimum of (earliest def of next segment,
// latest end point of containing segment)
SlotIndex ED, LE;
for (LiveInterval::SubRange &SR : LI->subranges()) {
LiveRange::iterator I = SR.find(Def);
if (I == SR.end())
continue;
if (I->start > Def)
ED = ED.isValid() ? std::min(ED, I->start) : I->start;
else
LE = LE.isValid() ? std::max(LE, I->end) : I->end;
}
if (LE.isValid())
NewEnd = std::min(NewEnd, LE);
if (ED.isValid())
NewEnd = std::min(NewEnd, ED);
// We only want to do the extension if there was a subrange that
// was live across Def.
if (LE.isValid()) {
LiveRange::iterator S = LR.find(Def);
if (S != LR.begin())
std::prev(S)->end = NewEnd;
}
}
DEBUG({
dbgs() << "\t\tremoved " << i << '@' << Def << ": " << LR << '\n';
if (LI != nullptr)
dbgs() << "\t\t LHS = " << *LI << '\n';
});
LLVM_FALLTHROUGH;
}
case CR_Erase: {
MachineInstr *MI = Indexes->getInstructionFromIndex(Def);
assert(MI && "No instruction to erase");
if (MI->isCopy()) {
unsigned Reg = MI->getOperand(1).getReg();
if (TargetRegisterInfo::isVirtualRegister(Reg) &&
Reg != CP.getSrcReg() && Reg != CP.getDstReg())
ShrinkRegs.push_back(Reg);
}
ErasedInstrs.insert(MI);
DEBUG(dbgs() << "\t\terased:\t" << Def << '\t' << *MI);
LIS->RemoveMachineInstrFromMaps(*MI);
MI->eraseFromParent();
break;
}
default:
break;
}
}
}
void RegisterCoalescer::joinSubRegRanges(LiveRange &LRange, LiveRange &RRange,
LaneBitmask LaneMask,
const CoalescerPair &CP) {
SmallVector<VNInfo*, 16> NewVNInfo;
JoinVals RHSVals(RRange, CP.getSrcReg(), CP.getSrcIdx(), LaneMask,
NewVNInfo, CP, LIS, TRI, true, true);
JoinVals LHSVals(LRange, CP.getDstReg(), CP.getDstIdx(), LaneMask,
NewVNInfo, CP, LIS, TRI, true, true);
// Compute NewVNInfo and resolve conflicts (see also joinVirtRegs())
// We should be able to resolve all conflicts here as we could successfully do
// it on the mainrange already. There is however a problem when multiple
// ranges get mapped to the "overflow" lane mask bit which creates unexpected
// interferences.
if (!LHSVals.mapValues(RHSVals) || !RHSVals.mapValues(LHSVals)) {
// We already determined that it is legal to merge the intervals, so this
// should never fail.
llvm_unreachable("*** Couldn't join subrange!\n");
}
if (!LHSVals.resolveConflicts(RHSVals) ||
!RHSVals.resolveConflicts(LHSVals)) {
// We already determined that it is legal to merge the intervals, so this
// should never fail.
llvm_unreachable("*** Couldn't join subrange!\n");
}
// The merging algorithm in LiveInterval::join() can't handle conflicting
// value mappings, so we need to remove any live ranges that overlap a
// CR_Replace resolution. Collect a set of end points that can be used to
// restore the live range after joining.
SmallVector<SlotIndex, 8> EndPoints;
LHSVals.pruneValues(RHSVals, EndPoints, false);
RHSVals.pruneValues(LHSVals, EndPoints, false);
LHSVals.removeImplicitDefs();
RHSVals.removeImplicitDefs();
LRange.verify();
RRange.verify();
// Join RRange into LHS.
LRange.join(RRange, LHSVals.getAssignments(), RHSVals.getAssignments(),
NewVNInfo);
DEBUG(dbgs() << "\t\tjoined lanes: " << LRange << "\n");
if (EndPoints.empty())
return;
// Recompute the parts of the live range we had to remove because of
// CR_Replace conflicts.
DEBUG({
dbgs() << "\t\trestoring liveness to " << EndPoints.size() << " points: ";
for (unsigned i = 0, n = EndPoints.size(); i != n; ++i) {
dbgs() << EndPoints[i];
if (i != n-1)
dbgs() << ',';
}
dbgs() << ": " << LRange << '\n';
});
LIS->extendToIndices(LRange, EndPoints);
}
void RegisterCoalescer::mergeSubRangeInto(LiveInterval &LI,
const LiveRange &ToMerge,
LaneBitmask LaneMask,
CoalescerPair &CP) {
BumpPtrAllocator &Allocator = LIS->getVNInfoAllocator();
for (LiveInterval::SubRange &R : LI.subranges()) {
LaneBitmask RMask = R.LaneMask;
// LaneMask of subregisters common to subrange R and ToMerge.
LaneBitmask Common = RMask & LaneMask;
// There is nothing to do without common subregs.
if (Common.none())
continue;
DEBUG(dbgs() << "\t\tCopy+Merge " << PrintLaneMask(RMask) << " into "
<< PrintLaneMask(Common) << '\n');
// LaneMask of subregisters contained in the R range but not in ToMerge,
// they have to split into their own subrange.
LaneBitmask LRest = RMask & ~LaneMask;
LiveInterval::SubRange *CommonRange;
if (LRest.any()) {
R.LaneMask = LRest;
DEBUG(dbgs() << "\t\tReduce Lane to " << PrintLaneMask(LRest) << '\n');
// Duplicate SubRange for newly merged common stuff.
CommonRange = LI.createSubRangeFrom(Allocator, Common, R);
} else {
// Reuse the existing range.
R.LaneMask = Common;
CommonRange = &R;
}
LiveRange RangeCopy(ToMerge, Allocator);
joinSubRegRanges(*CommonRange, RangeCopy, Common, CP);
LaneMask &= ~RMask;
}
if (LaneMask.any()) {
DEBUG(dbgs() << "\t\tNew Lane " << PrintLaneMask(LaneMask) << '\n');
LI.createSubRangeFrom(Allocator, LaneMask, ToMerge);
}
}
bool RegisterCoalescer::joinVirtRegs(CoalescerPair &CP) {
SmallVector<VNInfo*, 16> NewVNInfo;
LiveInterval &RHS = LIS->getInterval(CP.getSrcReg());
LiveInterval &LHS = LIS->getInterval(CP.getDstReg());
bool TrackSubRegLiveness = MRI->shouldTrackSubRegLiveness(*CP.getNewRC());
JoinVals RHSVals(RHS, CP.getSrcReg(), CP.getSrcIdx(), LaneBitmask::getNone(),
NewVNInfo, CP, LIS, TRI, false, TrackSubRegLiveness);
JoinVals LHSVals(LHS, CP.getDstReg(), CP.getDstIdx(), LaneBitmask::getNone(),
NewVNInfo, CP, LIS, TRI, false, TrackSubRegLiveness);
DEBUG(dbgs() << "\t\tRHS = " << RHS
<< "\n\t\tLHS = " << LHS
<< '\n');
// First compute NewVNInfo and the simple value mappings.
// Detect impossible conflicts early.
if (!LHSVals.mapValues(RHSVals) || !RHSVals.mapValues(LHSVals))
return false;
// Some conflicts can only be resolved after all values have been mapped.
if (!LHSVals.resolveConflicts(RHSVals) || !RHSVals.resolveConflicts(LHSVals))
return false;
// All clear, the live ranges can be merged.
if (RHS.hasSubRanges() || LHS.hasSubRanges()) {
BumpPtrAllocator &Allocator = LIS->getVNInfoAllocator();
// Transform lanemasks from the LHS to masks in the coalesced register and
// create initial subranges if necessary.
unsigned DstIdx = CP.getDstIdx();
if (!LHS.hasSubRanges()) {
LaneBitmask Mask = DstIdx == 0 ? CP.getNewRC()->getLaneMask()
: TRI->getSubRegIndexLaneMask(DstIdx);
// LHS must support subregs or we wouldn't be in this codepath.
assert(Mask.any());
LHS.createSubRangeFrom(Allocator, Mask, LHS);
} else if (DstIdx != 0) {
// Transform LHS lanemasks to new register class if necessary.
for (LiveInterval::SubRange &R : LHS.subranges()) {
LaneBitmask Mask = TRI->composeSubRegIndexLaneMask(DstIdx, R.LaneMask);
R.LaneMask = Mask;
}
}
DEBUG(dbgs() << "\t\tLHST = " << PrintReg(CP.getDstReg())
<< ' ' << LHS << '\n');
// Determine lanemasks of RHS in the coalesced register and merge subranges.
unsigned SrcIdx = CP.getSrcIdx();
if (!RHS.hasSubRanges()) {
LaneBitmask Mask = SrcIdx == 0 ? CP.getNewRC()->getLaneMask()
: TRI->getSubRegIndexLaneMask(SrcIdx);
mergeSubRangeInto(LHS, RHS, Mask, CP);
} else {
// Pair up subranges and merge.
for (LiveInterval::SubRange &R : RHS.subranges()) {
LaneBitmask Mask = TRI->composeSubRegIndexLaneMask(SrcIdx, R.LaneMask);
mergeSubRangeInto(LHS, R, Mask, CP);
}
}
DEBUG(dbgs() << "\tJoined SubRanges " << LHS << "\n");
// Pruning implicit defs from subranges may result in the main range
// having stale segments.
LHSVals.pruneMainSegments(LHS, ShrinkMainRange);
LHSVals.pruneSubRegValues(LHS, ShrinkMask);
RHSVals.pruneSubRegValues(LHS, ShrinkMask);
}
// The merging algorithm in LiveInterval::join() can't handle conflicting
// value mappings, so we need to remove any live ranges that overlap a
// CR_Replace resolution. Collect a set of end points that can be used to
// restore the live range after joining.
SmallVector<SlotIndex, 8> EndPoints;
LHSVals.pruneValues(RHSVals, EndPoints, true);
RHSVals.pruneValues(LHSVals, EndPoints, true);
// Erase COPY and IMPLICIT_DEF instructions. This may cause some external
// registers to require trimming.
SmallVector<unsigned, 8> ShrinkRegs;
LHSVals.eraseInstrs(ErasedInstrs, ShrinkRegs, &LHS);
RHSVals.eraseInstrs(ErasedInstrs, ShrinkRegs);
while (!ShrinkRegs.empty())
shrinkToUses(&LIS->getInterval(ShrinkRegs.pop_back_val()));
// Join RHS into LHS.
LHS.join(RHS, LHSVals.getAssignments(), RHSVals.getAssignments(), NewVNInfo);
// Kill flags are going to be wrong if the live ranges were overlapping.
// Eventually, we should simply clear all kill flags when computing live
// ranges. They are reinserted after register allocation.
MRI->clearKillFlags(LHS.reg);
MRI->clearKillFlags(RHS.reg);
if (!EndPoints.empty()) {
// Recompute the parts of the live range we had to remove because of
// CR_Replace conflicts.
DEBUG({
dbgs() << "\t\trestoring liveness to " << EndPoints.size() << " points: ";
for (unsigned i = 0, n = EndPoints.size(); i != n; ++i) {
dbgs() << EndPoints[i];
if (i != n-1)
dbgs() << ',';
}
dbgs() << ": " << LHS << '\n';
});
LIS->extendToIndices((LiveRange&)LHS, EndPoints);
}
return true;
}
bool RegisterCoalescer::joinIntervals(CoalescerPair &CP) {
return CP.isPhys() ? joinReservedPhysReg(CP) : joinVirtRegs(CP);
}
namespace {
/// Information concerning MBB coalescing priority.
struct MBBPriorityInfo {
MachineBasicBlock *MBB;
unsigned Depth;
bool IsSplit;
MBBPriorityInfo(MachineBasicBlock *mbb, unsigned depth, bool issplit)
: MBB(mbb), Depth(depth), IsSplit(issplit) {}
};
}
/// C-style comparator that sorts first based on the loop depth of the basic
/// block (the unsigned), and then on the MBB number.
///
/// EnableGlobalCopies assumes that the primary sort key is loop depth.
static int compareMBBPriority(const MBBPriorityInfo *LHS,
const MBBPriorityInfo *RHS) {
// Deeper loops first
if (LHS->Depth != RHS->Depth)
return LHS->Depth > RHS->Depth ? -1 : 1;
// Try to unsplit critical edges next.
if (LHS->IsSplit != RHS->IsSplit)
return LHS->IsSplit ? -1 : 1;
// Prefer blocks that are more connected in the CFG. This takes care of
// the most difficult copies first while intervals are short.
unsigned cl = LHS->MBB->pred_size() + LHS->MBB->succ_size();
unsigned cr = RHS->MBB->pred_size() + RHS->MBB->succ_size();
if (cl != cr)
return cl > cr ? -1 : 1;
// As a last resort, sort by block number.
return LHS->MBB->getNumber() < RHS->MBB->getNumber() ? -1 : 1;
}
/// \returns true if the given copy uses or defines a local live range.
static bool isLocalCopy(MachineInstr *Copy, const LiveIntervals *LIS) {
if (!Copy->isCopy())
return false;
if (Copy->getOperand(1).isUndef())
return false;
unsigned SrcReg = Copy->getOperand(1).getReg();
unsigned DstReg = Copy->getOperand(0).getReg();
if (TargetRegisterInfo::isPhysicalRegister(SrcReg)
|| TargetRegisterInfo::isPhysicalRegister(DstReg))
return false;
return LIS->intervalIsInOneMBB(LIS->getInterval(SrcReg))
|| LIS->intervalIsInOneMBB(LIS->getInterval(DstReg));
}
bool RegisterCoalescer::
copyCoalesceWorkList(MutableArrayRef<MachineInstr*> CurrList) {
bool Progress = false;
for (unsigned i = 0, e = CurrList.size(); i != e; ++i) {
if (!CurrList[i])
continue;
// Skip instruction pointers that have already been erased, for example by
// dead code elimination.
if (ErasedInstrs.erase(CurrList[i])) {
CurrList[i] = nullptr;
continue;
}
bool Again = false;
bool Success = joinCopy(CurrList[i], Again);
Progress |= Success;
if (Success || !Again)
CurrList[i] = nullptr;
}
return Progress;
}
/// Check if DstReg is a terminal node.
/// I.e., it does not have any affinity other than \p Copy.
static bool isTerminalReg(unsigned DstReg, const MachineInstr &Copy,
const MachineRegisterInfo *MRI) {
assert(Copy.isCopyLike());
// Check if the destination of this copy as any other affinity.
for (const MachineInstr &MI : MRI->reg_nodbg_instructions(DstReg))
if (&MI != &Copy && MI.isCopyLike())
return false;
return true;
}
bool RegisterCoalescer::applyTerminalRule(const MachineInstr &Copy) const {
assert(Copy.isCopyLike());
if (!UseTerminalRule)
return false;
unsigned DstReg, DstSubReg, SrcReg, SrcSubReg;
isMoveInstr(*TRI, &Copy, SrcReg, DstReg, SrcSubReg, DstSubReg);
// Check if the destination of this copy has any other affinity.
if (TargetRegisterInfo::isPhysicalRegister(DstReg) ||
// If SrcReg is a physical register, the copy won't be coalesced.
// Ignoring it may have other side effect (like missing
// rematerialization). So keep it.
TargetRegisterInfo::isPhysicalRegister(SrcReg) ||
!isTerminalReg(DstReg, Copy, MRI))
return false;
// DstReg is a terminal node. Check if it interferes with any other
// copy involving SrcReg.
const MachineBasicBlock *OrigBB = Copy.getParent();
const LiveInterval &DstLI = LIS->getInterval(DstReg);
for (const MachineInstr &MI : MRI->reg_nodbg_instructions(SrcReg)) {
// Technically we should check if the weight of the new copy is
// interesting compared to the other one and update the weight
// of the copies accordingly. However, this would only work if
// we would gather all the copies first then coalesce, whereas
// right now we interleave both actions.
// For now, just consider the copies that are in the same block.
if (&MI == &Copy || !MI.isCopyLike() || MI.getParent() != OrigBB)
continue;
unsigned OtherReg, OtherSubReg, OtherSrcReg, OtherSrcSubReg;
isMoveInstr(*TRI, &Copy, OtherSrcReg, OtherReg, OtherSrcSubReg,
OtherSubReg);
if (OtherReg == SrcReg)
OtherReg = OtherSrcReg;
// Check if OtherReg is a non-terminal.
if (TargetRegisterInfo::isPhysicalRegister(OtherReg) ||
isTerminalReg(OtherReg, MI, MRI))
continue;
// Check that OtherReg interfere with DstReg.
if (LIS->getInterval(OtherReg).overlaps(DstLI)) {
DEBUG(dbgs() << "Apply terminal rule for: " << PrintReg(DstReg) << '\n');
return true;
}
}
return false;
}
void
RegisterCoalescer::copyCoalesceInMBB(MachineBasicBlock *MBB) {
DEBUG(dbgs() << MBB->getName() << ":\n");
// Collect all copy-like instructions in MBB. Don't start coalescing anything
// yet, it might invalidate the iterator.
const unsigned PrevSize = WorkList.size();
if (JoinGlobalCopies) {
SmallVector<MachineInstr*, 2> LocalTerminals;
SmallVector<MachineInstr*, 2> GlobalTerminals;
// Coalesce copies bottom-up to coalesce local defs before local uses. They
// are not inherently easier to resolve, but slightly preferable until we
// have local live range splitting. In particular this is required by
// cmp+jmp macro fusion.
for (MachineBasicBlock::iterator MII = MBB->begin(), E = MBB->end();
MII != E; ++MII) {
if (!MII->isCopyLike())
continue;
bool ApplyTerminalRule = applyTerminalRule(*MII);
if (isLocalCopy(&(*MII), LIS)) {
if (ApplyTerminalRule)
LocalTerminals.push_back(&(*MII));
else
LocalWorkList.push_back(&(*MII));
} else {
if (ApplyTerminalRule)
GlobalTerminals.push_back(&(*MII));
else
WorkList.push_back(&(*MII));
}
}
// Append the copies evicted by the terminal rule at the end of the list.
LocalWorkList.append(LocalTerminals.begin(), LocalTerminals.end());
WorkList.append(GlobalTerminals.begin(), GlobalTerminals.end());
}
else {
SmallVector<MachineInstr*, 2> Terminals;
for (MachineInstr &MII : *MBB)
if (MII.isCopyLike()) {
if (applyTerminalRule(MII))
Terminals.push_back(&MII);
else
WorkList.push_back(&MII);
}
// Append the copies evicted by the terminal rule at the end of the list.
WorkList.append(Terminals.begin(), Terminals.end());
}
// Try coalescing the collected copies immediately, and remove the nulls.
// This prevents the WorkList from getting too large since most copies are
// joinable on the first attempt.
MutableArrayRef<MachineInstr*>
CurrList(WorkList.begin() + PrevSize, WorkList.end());
if (copyCoalesceWorkList(CurrList))
WorkList.erase(std::remove(WorkList.begin() + PrevSize, WorkList.end(),
(MachineInstr*)nullptr), WorkList.end());
}
void RegisterCoalescer::coalesceLocals() {
copyCoalesceWorkList(LocalWorkList);
for (unsigned j = 0, je = LocalWorkList.size(); j != je; ++j) {
if (LocalWorkList[j])
WorkList.push_back(LocalWorkList[j]);
}
LocalWorkList.clear();
}
void RegisterCoalescer::joinAllIntervals() {
DEBUG(dbgs() << "********** JOINING INTERVALS ***********\n");
assert(WorkList.empty() && LocalWorkList.empty() && "Old data still around.");
std::vector<MBBPriorityInfo> MBBs;
MBBs.reserve(MF->size());
for (MachineFunction::iterator I = MF->begin(), E = MF->end(); I != E; ++I) {
MachineBasicBlock *MBB = &*I;
MBBs.push_back(MBBPriorityInfo(MBB, Loops->getLoopDepth(MBB),
JoinSplitEdges && isSplitEdge(MBB)));
}
array_pod_sort(MBBs.begin(), MBBs.end(), compareMBBPriority);
// Coalesce intervals in MBB priority order.
unsigned CurrDepth = UINT_MAX;
for (unsigned i = 0, e = MBBs.size(); i != e; ++i) {
// Try coalescing the collected local copies for deeper loops.
if (JoinGlobalCopies && MBBs[i].Depth < CurrDepth) {
coalesceLocals();
CurrDepth = MBBs[i].Depth;
}
copyCoalesceInMBB(MBBs[i].MBB);
}
coalesceLocals();
// Joining intervals can allow other intervals to be joined. Iteratively join
// until we make no progress.
while (copyCoalesceWorkList(WorkList))
/* empty */ ;
}
void RegisterCoalescer::releaseMemory() {
ErasedInstrs.clear();
WorkList.clear();
DeadDefs.clear();
InflateRegs.clear();
}
bool RegisterCoalescer::runOnMachineFunction(MachineFunction &fn) {
MF = &fn;
MRI = &fn.getRegInfo();
TM = &fn.getTarget();
const TargetSubtargetInfo &STI = fn.getSubtarget();
TRI = STI.getRegisterInfo();
TII = STI.getInstrInfo();
LIS = &getAnalysis<LiveIntervals>();
AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
Loops = &getAnalysis<MachineLoopInfo>();
if (EnableGlobalCopies == cl::BOU_UNSET)
JoinGlobalCopies = STI.enableJoinGlobalCopies();
else
JoinGlobalCopies = (EnableGlobalCopies == cl::BOU_TRUE);
// The MachineScheduler does not currently require JoinSplitEdges. This will
// either be enabled unconditionally or replaced by a more general live range
// splitting optimization.
JoinSplitEdges = EnableJoinSplits;
DEBUG(dbgs() << "********** SIMPLE REGISTER COALESCING **********\n"
<< "********** Function: " << MF->getName() << '\n');
if (VerifyCoalescing)
MF->verify(this, "Before register coalescing");
RegClassInfo.runOnMachineFunction(fn);
// Join (coalesce) intervals if requested.
if (EnableJoining)
joinAllIntervals();
// After deleting a lot of copies, register classes may be less constrained.
// Removing sub-register operands may allow GR32_ABCD -> GR32 and DPR_VFP2 ->
// DPR inflation.
array_pod_sort(InflateRegs.begin(), InflateRegs.end());
InflateRegs.erase(std::unique(InflateRegs.begin(), InflateRegs.end()),
InflateRegs.end());
DEBUG(dbgs() << "Trying to inflate " << InflateRegs.size() << " regs.\n");
for (unsigned i = 0, e = InflateRegs.size(); i != e; ++i) {
unsigned Reg = InflateRegs[i];
if (MRI->reg_nodbg_empty(Reg))
continue;
if (MRI->recomputeRegClass(Reg)) {
DEBUG(dbgs() << PrintReg(Reg) << " inflated to "
<< TRI->getRegClassName(MRI->getRegClass(Reg)) << '\n');
++NumInflated;
LiveInterval &LI = LIS->getInterval(Reg);
if (LI.hasSubRanges()) {
// If the inflated register class does not support subregisters anymore
// remove the subranges.
if (!MRI->shouldTrackSubRegLiveness(Reg)) {
LI.clearSubRanges();
} else {
#ifndef NDEBUG
LaneBitmask MaxMask = MRI->getMaxLaneMaskForVReg(Reg);
// If subranges are still supported, then the same subregs
// should still be supported.
for (LiveInterval::SubRange &S : LI.subranges()) {
assert((S.LaneMask & ~MaxMask).none());
}
#endif
}
}
}
}
DEBUG(dump());
if (VerifyCoalescing)
MF->verify(this, "After register coalescing");
return true;
}
void RegisterCoalescer::print(raw_ostream &O, const Module* m) const {
LIS->print(O, m);
}
Index: projects/clang400-import/contrib/llvm/lib/MC/MCCodeView.cpp
===================================================================
--- projects/clang400-import/contrib/llvm/lib/MC/MCCodeView.cpp (revision 313642)
+++ projects/clang400-import/contrib/llvm/lib/MC/MCCodeView.cpp (revision 313643)
@@ -1,572 +1,572 @@
//===- MCCodeView.h - Machine Code CodeView support -------------*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// Holds state from .cv_file and .cv_loc directives for later emission.
//
//===----------------------------------------------------------------------===//
#include "llvm/MC/MCCodeView.h"
#include "llvm/MC/MCAsmLayout.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/DebugInfo/CodeView/CodeView.h"
#include "llvm/DebugInfo/CodeView/Line.h"
#include "llvm/DebugInfo/CodeView/SymbolRecord.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCObjectStreamer.h"
#include "llvm/MC/MCValue.h"
#include "llvm/Support/COFF.h"
#include "llvm/Support/EndianStream.h"
using namespace llvm;
using namespace llvm::codeview;
CodeViewContext::CodeViewContext() {}
CodeViewContext::~CodeViewContext() {
// If someone inserted strings into the string table but never actually
// emitted them somewhere, clean up the fragment.
if (!InsertedStrTabFragment)
delete StrTabFragment;
}
/// This is a valid number for use with .cv_loc if we've already seen a .cv_file
/// for it.
bool CodeViewContext::isValidFileNumber(unsigned FileNumber) const {
unsigned Idx = FileNumber - 1;
if (Idx < Filenames.size())
return !Filenames[Idx].empty();
return false;
}
bool CodeViewContext::addFile(unsigned FileNumber, StringRef Filename) {
assert(FileNumber > 0);
Filename = addToStringTable(Filename);
unsigned Idx = FileNumber - 1;
if (Idx >= Filenames.size())
Filenames.resize(Idx + 1);
if (Filename.empty())
Filename = "<stdin>";
if (!Filenames[Idx].empty())
return false;
// FIXME: We should store the string table offset of the filename, rather than
// the filename itself for efficiency.
Filename = addToStringTable(Filename);
Filenames[Idx] = Filename;
return true;
}
bool CodeViewContext::recordFunctionId(unsigned FuncId) {
if (FuncId >= Functions.size())
Functions.resize(FuncId + 1);
// Return false if this function info was already allocated.
if (!Functions[FuncId].isUnallocatedFunctionInfo())
return false;
// Mark this as an allocated normal function, and leave the rest alone.
Functions[FuncId].ParentFuncIdPlusOne = MCCVFunctionInfo::FunctionSentinel;
return true;
}
bool CodeViewContext::recordInlinedCallSiteId(unsigned FuncId, unsigned IAFunc,
unsigned IAFile, unsigned IALine,
unsigned IACol) {
if (FuncId >= Functions.size())
Functions.resize(FuncId + 1);
// Return false if this function info was already allocated.
if (!Functions[FuncId].isUnallocatedFunctionInfo())
return false;
MCCVFunctionInfo::LineInfo InlinedAt;
InlinedAt.File = IAFile;
InlinedAt.Line = IALine;
InlinedAt.Col = IACol;
// Mark this as an inlined call site and record call site line info.
MCCVFunctionInfo *Info = &Functions[FuncId];
Info->ParentFuncIdPlusOne = IAFunc + 1;
Info->InlinedAt = InlinedAt;
// Walk up the call chain adding this function id to the InlinedAtMap of all
// transitive callers until we hit a real function.
while (Info->isInlinedCallSite()) {
InlinedAt = Info->InlinedAt;
Info = getCVFunctionInfo(Info->getParentFuncId());
Info->InlinedAtMap[FuncId] = InlinedAt;
}
return true;
}
MCDataFragment *CodeViewContext::getStringTableFragment() {
if (!StrTabFragment) {
StrTabFragment = new MCDataFragment();
// Start a new string table out with a null byte.
StrTabFragment->getContents().push_back('\0');
}
return StrTabFragment;
}
StringRef CodeViewContext::addToStringTable(StringRef S) {
SmallVectorImpl<char> &Contents = getStringTableFragment()->getContents();
auto Insertion =
StringTable.insert(std::make_pair(S, unsigned(Contents.size())));
// Return the string from the table, since it is stable.
S = Insertion.first->first();
if (Insertion.second) {
// The string map key is always null terminated.
Contents.append(S.begin(), S.end() + 1);
}
return S;
}
unsigned CodeViewContext::getStringTableOffset(StringRef S) {
// A string table offset of zero is always the empty string.
if (S.empty())
return 0;
auto I = StringTable.find(S);
assert(I != StringTable.end());
return I->second;
}
void CodeViewContext::emitStringTable(MCObjectStreamer &OS) {
MCContext &Ctx = OS.getContext();
MCSymbol *StringBegin = Ctx.createTempSymbol("strtab_begin", false),
*StringEnd = Ctx.createTempSymbol("strtab_end", false);
OS.EmitIntValue(unsigned(ModuleSubstreamKind::StringTable), 4);
OS.emitAbsoluteSymbolDiff(StringEnd, StringBegin, 4);
OS.EmitLabel(StringBegin);
// Put the string table data fragment here, if we haven't already put it
// somewhere else. If somebody wants two string tables in their .s file, one
// will just be empty.
if (!InsertedStrTabFragment) {
OS.insert(getStringTableFragment());
InsertedStrTabFragment = true;
}
OS.EmitValueToAlignment(4, 0);
OS.EmitLabel(StringEnd);
}
void CodeViewContext::emitFileChecksums(MCObjectStreamer &OS) {
// Do nothing if there are no file checksums. Microsoft's linker rejects empty
// CodeView substreams.
if (Filenames.empty())
return;
MCContext &Ctx = OS.getContext();
MCSymbol *FileBegin = Ctx.createTempSymbol("filechecksums_begin", false),
*FileEnd = Ctx.createTempSymbol("filechecksums_end", false);
OS.EmitIntValue(unsigned(ModuleSubstreamKind::FileChecksums), 4);
OS.emitAbsoluteSymbolDiff(FileEnd, FileBegin, 4);
OS.EmitLabel(FileBegin);
// Emit an array of FileChecksum entries. We index into this table using the
// user-provided file number. Each entry is currently 8 bytes, as we don't
// emit checksums.
for (StringRef Filename : Filenames) {
OS.EmitIntValue(getStringTableOffset(Filename), 4);
// Zero the next two fields and align back to 4 bytes. This indicates that
// no checksum is present.
OS.EmitIntValue(0, 4);
}
OS.EmitLabel(FileEnd);
}
void CodeViewContext::emitLineTableForFunction(MCObjectStreamer &OS,
unsigned FuncId,
const MCSymbol *FuncBegin,
const MCSymbol *FuncEnd) {
MCContext &Ctx = OS.getContext();
MCSymbol *LineBegin = Ctx.createTempSymbol("linetable_begin", false),
*LineEnd = Ctx.createTempSymbol("linetable_end", false);
OS.EmitIntValue(unsigned(ModuleSubstreamKind::Lines), 4);
OS.emitAbsoluteSymbolDiff(LineEnd, LineBegin, 4);
OS.EmitLabel(LineBegin);
OS.EmitCOFFSecRel32(FuncBegin, /*Offset=*/0);
OS.EmitCOFFSectionIndex(FuncBegin);
// Actual line info.
std::vector<MCCVLineEntry> Locs = getFunctionLineEntries(FuncId);
bool HaveColumns = any_of(Locs, [](const MCCVLineEntry &LineEntry) {
return LineEntry.getColumn() != 0;
});
OS.EmitIntValue(HaveColumns ? int(LineFlags::HaveColumns) : 0, 2);
OS.emitAbsoluteSymbolDiff(FuncEnd, FuncBegin, 4);
for (auto I = Locs.begin(), E = Locs.end(); I != E;) {
// Emit a file segment for the run of locations that share a file id.
unsigned CurFileNum = I->getFileNum();
auto FileSegEnd =
std::find_if(I, E, [CurFileNum](const MCCVLineEntry &Loc) {
return Loc.getFileNum() != CurFileNum;
});
unsigned EntryCount = FileSegEnd - I;
OS.AddComment("Segment for file '" + Twine(Filenames[CurFileNum - 1]) +
"' begins");
OS.EmitIntValue(8 * (CurFileNum - 1), 4);
OS.EmitIntValue(EntryCount, 4);
uint32_t SegmentSize = 12;
SegmentSize += 8 * EntryCount;
if (HaveColumns)
SegmentSize += 4 * EntryCount;
OS.EmitIntValue(SegmentSize, 4);
for (auto J = I; J != FileSegEnd; ++J) {
OS.emitAbsoluteSymbolDiff(J->getLabel(), FuncBegin, 4);
unsigned LineData = J->getLine();
if (J->isStmt())
LineData |= LineInfo::StatementFlag;
OS.EmitIntValue(LineData, 4);
}
if (HaveColumns) {
for (auto J = I; J != FileSegEnd; ++J) {
OS.EmitIntValue(J->getColumn(), 2);
OS.EmitIntValue(0, 2);
}
}
I = FileSegEnd;
}
OS.EmitLabel(LineEnd);
}
static bool compressAnnotation(uint32_t Data, SmallVectorImpl<char> &Buffer) {
if (isUInt<7>(Data)) {
Buffer.push_back(Data);
return true;
}
if (isUInt<14>(Data)) {
Buffer.push_back((Data >> 8) | 0x80);
Buffer.push_back(Data & 0xff);
return true;
}
if (isUInt<29>(Data)) {
Buffer.push_back((Data >> 24) | 0xC0);
Buffer.push_back((Data >> 16) & 0xff);
Buffer.push_back((Data >> 8) & 0xff);
Buffer.push_back(Data & 0xff);
return true;
}
return false;
}
static bool compressAnnotation(BinaryAnnotationsOpCode Annotation,
SmallVectorImpl<char> &Buffer) {
return compressAnnotation(static_cast<uint32_t>(Annotation), Buffer);
}
static uint32_t encodeSignedNumber(uint32_t Data) {
if (Data >> 31)
return ((-Data) << 1) | 1;
return Data << 1;
}
void CodeViewContext::emitInlineLineTableForFunction(MCObjectStreamer &OS,
unsigned PrimaryFunctionId,
unsigned SourceFileId,
unsigned SourceLineNum,
const MCSymbol *FnStartSym,
const MCSymbol *FnEndSym) {
// Create and insert a fragment into the current section that will be encoded
// later.
new MCCVInlineLineTableFragment(PrimaryFunctionId, SourceFileId,
SourceLineNum, FnStartSym, FnEndSym,
OS.getCurrentSectionOnly());
}
void CodeViewContext::emitDefRange(
MCObjectStreamer &OS,
ArrayRef<std::pair<const MCSymbol *, const MCSymbol *>> Ranges,
StringRef FixedSizePortion) {
// Create and insert a fragment into the current section that will be encoded
// later.
new MCCVDefRangeFragment(Ranges, FixedSizePortion,
OS.getCurrentSectionOnly());
}
static unsigned computeLabelDiff(MCAsmLayout &Layout, const MCSymbol *Begin,
const MCSymbol *End) {
MCContext &Ctx = Layout.getAssembler().getContext();
MCSymbolRefExpr::VariantKind Variant = MCSymbolRefExpr::VK_None;
const MCExpr *BeginRef = MCSymbolRefExpr::create(Begin, Variant, Ctx),
*EndRef = MCSymbolRefExpr::create(End, Variant, Ctx);
const MCExpr *AddrDelta =
MCBinaryExpr::create(MCBinaryExpr::Sub, EndRef, BeginRef, Ctx);
int64_t Result;
bool Success = AddrDelta->evaluateKnownAbsolute(Result, Layout);
assert(Success && "failed to evaluate label difference as absolute");
(void)Success;
assert(Result >= 0 && "negative label difference requested");
assert(Result < UINT_MAX && "label difference greater than 2GB");
return unsigned(Result);
}
void CodeViewContext::encodeInlineLineTable(MCAsmLayout &Layout,
MCCVInlineLineTableFragment &Frag) {
size_t LocBegin;
size_t LocEnd;
std::tie(LocBegin, LocEnd) = getLineExtent(Frag.SiteFuncId);
// Include all child inline call sites in our .cv_loc extent.
MCCVFunctionInfo *SiteInfo = getCVFunctionInfo(Frag.SiteFuncId);
for (auto &KV : SiteInfo->InlinedAtMap) {
unsigned ChildId = KV.first;
auto Extent = getLineExtent(ChildId);
LocBegin = std::min(LocBegin, Extent.first);
LocEnd = std::max(LocEnd, Extent.second);
}
if (LocBegin >= LocEnd)
return;
ArrayRef<MCCVLineEntry> Locs = getLinesForExtent(LocBegin, LocEnd);
if (Locs.empty())
return;
// Make an artificial start location using the function start and the inlinee
// lines start location information. All deltas start relative to this
// location.
MCCVLineEntry StartLoc(Frag.getFnStartSym(), MCCVLoc(Locs.front()));
StartLoc.setFileNum(Frag.StartFileId);
StartLoc.setLine(Frag.StartLineNum);
bool HaveOpenRange = false;
const MCSymbol *LastLabel = Frag.getFnStartSym();
MCCVFunctionInfo::LineInfo LastSourceLoc, CurSourceLoc;
LastSourceLoc.File = Frag.StartFileId;
LastSourceLoc.Line = Frag.StartLineNum;
SmallVectorImpl<char> &Buffer = Frag.getContents();
Buffer.clear(); // Clear old contents if we went through relaxation.
for (const MCCVLineEntry &Loc : Locs) {
// Exit early if our line table would produce an oversized InlineSiteSym
// record. Account for the ChangeCodeLength annotation emitted after the
// loop ends.
constexpr uint32_t InlineSiteSize = 12;
constexpr uint32_t AnnotationSize = 8;
size_t MaxBufferSize = MaxRecordLength - InlineSiteSize - AnnotationSize;
if (Buffer.size() >= MaxBufferSize)
break;
if (Loc.getFunctionId() == Frag.SiteFuncId) {
CurSourceLoc.File = Loc.getFileNum();
CurSourceLoc.Line = Loc.getLine();
} else {
auto I = SiteInfo->InlinedAtMap.find(Loc.getFunctionId());
if (I != SiteInfo->InlinedAtMap.end()) {
// This .cv_loc is from a child inline call site. Use the source
// location of the inlined call site instead of the .cv_loc directive
// source location.
CurSourceLoc = I->second;
} else {
// We've hit a cv_loc not attributed to this inline call site. Use this
// label to end the PC range.
if (HaveOpenRange) {
unsigned Length = computeLabelDiff(Layout, LastLabel, Loc.getLabel());
compressAnnotation(BinaryAnnotationsOpCode::ChangeCodeLength, Buffer);
compressAnnotation(Length, Buffer);
LastLabel = Loc.getLabel();
}
HaveOpenRange = false;
continue;
}
}
// Skip this .cv_loc if we have an open range and this isn't a meaningful
// source location update. The current table format does not support column
// info, so we can skip updates for those.
if (HaveOpenRange && CurSourceLoc.File == LastSourceLoc.File &&
CurSourceLoc.Line == LastSourceLoc.Line)
continue;
HaveOpenRange = true;
if (CurSourceLoc.File != LastSourceLoc.File) {
// File ids are 1 based, and each file checksum table entry is 8 bytes
// long. See emitFileChecksums above.
unsigned FileOffset = 8 * (CurSourceLoc.File - 1);
compressAnnotation(BinaryAnnotationsOpCode::ChangeFile, Buffer);
compressAnnotation(FileOffset, Buffer);
}
int LineDelta = CurSourceLoc.Line - LastSourceLoc.Line;
unsigned EncodedLineDelta = encodeSignedNumber(LineDelta);
unsigned CodeDelta = computeLabelDiff(Layout, LastLabel, Loc.getLabel());
if (CodeDelta == 0 && LineDelta != 0) {
compressAnnotation(BinaryAnnotationsOpCode::ChangeLineOffset, Buffer);
compressAnnotation(EncodedLineDelta, Buffer);
} else if (EncodedLineDelta < 0x8 && CodeDelta <= 0xf) {
// The ChangeCodeOffsetAndLineOffset combination opcode is used when the
// encoded line delta uses 3 or fewer set bits and the code offset fits
// in one nibble.
unsigned Operand = (EncodedLineDelta << 4) | CodeDelta;
compressAnnotation(BinaryAnnotationsOpCode::ChangeCodeOffsetAndLineOffset,
Buffer);
compressAnnotation(Operand, Buffer);
} else {
// Otherwise use the separate line and code deltas.
if (LineDelta != 0) {
compressAnnotation(BinaryAnnotationsOpCode::ChangeLineOffset, Buffer);
compressAnnotation(EncodedLineDelta, Buffer);
}
compressAnnotation(BinaryAnnotationsOpCode::ChangeCodeOffset, Buffer);
compressAnnotation(CodeDelta, Buffer);
}
LastLabel = Loc.getLabel();
LastSourceLoc = CurSourceLoc;
}
assert(HaveOpenRange);
unsigned EndSymLength =
computeLabelDiff(Layout, LastLabel, Frag.getFnEndSym());
unsigned LocAfterLength = ~0U;
ArrayRef<MCCVLineEntry> LocAfter = getLinesForExtent(LocEnd, LocEnd + 1);
if (!LocAfter.empty()) {
// Only try to compute this difference if we're in the same section.
const MCCVLineEntry &Loc = LocAfter[0];
if (&Loc.getLabel()->getSection(false) == &LastLabel->getSection(false))
LocAfterLength = computeLabelDiff(Layout, LastLabel, Loc.getLabel());
}
compressAnnotation(BinaryAnnotationsOpCode::ChangeCodeLength, Buffer);
compressAnnotation(std::min(EndSymLength, LocAfterLength), Buffer);
}
void CodeViewContext::encodeDefRange(MCAsmLayout &Layout,
MCCVDefRangeFragment &Frag) {
MCContext &Ctx = Layout.getAssembler().getContext();
SmallVectorImpl<char> &Contents = Frag.getContents();
Contents.clear();
SmallVectorImpl<MCFixup> &Fixups = Frag.getFixups();
Fixups.clear();
raw_svector_ostream OS(Contents);
// Compute all the sizes up front.
SmallVector<std::pair<unsigned, unsigned>, 4> GapAndRangeSizes;
const MCSymbol *LastLabel = nullptr;
for (std::pair<const MCSymbol *, const MCSymbol *> Range : Frag.getRanges()) {
unsigned GapSize =
LastLabel ? computeLabelDiff(Layout, LastLabel, Range.first) : 0;
unsigned RangeSize = computeLabelDiff(Layout, Range.first, Range.second);
GapAndRangeSizes.push_back({GapSize, RangeSize});
LastLabel = Range.second;
}
// Write down each range where the variable is defined.
for (size_t I = 0, E = Frag.getRanges().size(); I != E;) {
// If the range size of multiple consecutive ranges is under the max,
// combine the ranges and emit some gaps.
const MCSymbol *RangeBegin = Frag.getRanges()[I].first;
unsigned RangeSize = GapAndRangeSizes[I].second;
size_t J = I + 1;
for (; J != E; ++J) {
unsigned GapAndRangeSize = GapAndRangeSizes[J].first + GapAndRangeSizes[J].second;
if (RangeSize + GapAndRangeSize > MaxDefRange)
break;
RangeSize += GapAndRangeSize;
}
unsigned NumGaps = J - I - 1;
support::endian::Writer<support::little> LEWriter(OS);
unsigned Bias = 0;
// We must split the range into chunks of MaxDefRange, this is a fundamental
// limitation of the file format.
do {
uint16_t Chunk = std::min((uint32_t)MaxDefRange, RangeSize);
const MCSymbolRefExpr *SRE = MCSymbolRefExpr::create(RangeBegin, Ctx);
const MCBinaryExpr *BE =
MCBinaryExpr::createAdd(SRE, MCConstantExpr::create(Bias, Ctx), Ctx);
MCValue Res;
BE->evaluateAsRelocatable(Res, &Layout, /*Fixup=*/nullptr);
// Each record begins with a 2-byte number indicating how large the record
// is.
StringRef FixedSizePortion = Frag.getFixedSizePortion();
// Our record is a fixed sized prefix and a LocalVariableAddrRange that we
// are artificially constructing.
size_t RecordSize = FixedSizePortion.size() +
sizeof(LocalVariableAddrRange) + 4 * NumGaps;
- // Write out the recrod size.
- support::endian::Writer<support::little>(OS).write<uint16_t>(RecordSize);
+ // Write out the record size.
+ LEWriter.write<uint16_t>(RecordSize);
// Write out the fixed size prefix.
OS << FixedSizePortion;
// Make space for a fixup that will eventually have a section relative
// relocation pointing at the offset where the variable becomes live.
Fixups.push_back(MCFixup::create(Contents.size(), BE, FK_SecRel_4));
- Contents.resize(Contents.size() + 4); // Fixup for code start.
+ LEWriter.write<uint32_t>(0); // Fixup for code start.
// Make space for a fixup that will record the section index for the code.
Fixups.push_back(MCFixup::create(Contents.size(), BE, FK_SecRel_2));
- Contents.resize(Contents.size() + 2); // Fixup for section index.
+ LEWriter.write<uint16_t>(0); // Fixup for section index.
// Write down the range's extent.
LEWriter.write<uint16_t>(Chunk);
// Move on to the next range.
Bias += Chunk;
RangeSize -= Chunk;
} while (RangeSize > 0);
// Emit the gaps afterwards.
- assert((NumGaps == 0 || Bias < MaxDefRange) &&
+ assert((NumGaps == 0 || Bias <= MaxDefRange) &&
"large ranges should not have gaps");
unsigned GapStartOffset = GapAndRangeSizes[I].second;
for (++I; I != J; ++I) {
unsigned GapSize, RangeSize;
assert(I < GapAndRangeSizes.size());
std::tie(GapSize, RangeSize) = GapAndRangeSizes[I];
LEWriter.write<uint16_t>(GapStartOffset);
- LEWriter.write<uint16_t>(RangeSize);
+ LEWriter.write<uint16_t>(GapSize);
GapStartOffset += GapSize + RangeSize;
}
}
}
//
// This is called when an instruction is assembled into the specified section
// and if there is information from the last .cv_loc directive that has yet to have
// a line entry made for it is made.
//
void MCCVLineEntry::Make(MCObjectStreamer *MCOS) {
CodeViewContext &CVC = MCOS->getContext().getCVContext();
if (!CVC.getCVLocSeen())
return;
// Create a symbol at in the current section for use in the line entry.
MCSymbol *LineSym = MCOS->getContext().createTempSymbol();
// Set the value of the symbol to use for the MCCVLineEntry.
MCOS->EmitLabel(LineSym);
// Get the current .loc info saved in the context.
const MCCVLoc &CVLoc = CVC.getCurrentCVLoc();
// Create a (local) line entry with the symbol and the current .loc info.
MCCVLineEntry LineEntry(LineSym, CVLoc);
// clear CVLocSeen saying the current .loc info is now used.
CVC.clearCVLocSeen();
// Add the line entry to this section's entries.
CVC.addLineEntry(LineEntry);
}
Index: projects/clang400-import/contrib/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
===================================================================
--- projects/clang400-import/contrib/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp (revision 313642)
+++ projects/clang400-import/contrib/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp (revision 313643)
@@ -1,10714 +1,10715 @@
//===-- AArch64ISelLowering.cpp - AArch64 DAG Lowering Implementation ----===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file implements the AArch64TargetLowering class.
//
//===----------------------------------------------------------------------===//
#include "AArch64CallingConvention.h"
#include "AArch64MachineFunctionInfo.h"
#include "AArch64ISelLowering.h"
#include "AArch64PerfectShuffle.h"
#include "AArch64RegisterInfo.h"
#include "AArch64Subtarget.h"
#include "MCTargetDesc/AArch64AddressingModes.h"
#include "Utils/AArch64BaseInfo.h"
#include "llvm/ADT/APFloat.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/StringSwitch.h"
#include "llvm/ADT/Triple.h"
#include "llvm/ADT/Twine.h"
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/MachineValueType.h"
#include "llvm/CodeGen/RuntimeLibcalls.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GetElementPtrTypeIterator.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/OperandTraits.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Use.h"
#include "llvm/IR/Value.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CodeGen.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetCallingConv.h"
#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"
#include <algorithm>
#include <bitset>
#include <cassert>
#include <cctype>
#include <cstdint>
#include <cstdlib>
#include <iterator>
#include <limits>
#include <tuple>
#include <utility>
#include <vector>
using namespace llvm;
#define DEBUG_TYPE "aarch64-lower"
STATISTIC(NumTailCalls, "Number of tail calls");
STATISTIC(NumShiftInserts, "Number of vector shift inserts");
static cl::opt<bool>
EnableAArch64SlrGeneration("aarch64-shift-insert-generation", cl::Hidden,
cl::desc("Allow AArch64 SLI/SRI formation"),
cl::init(false));
// FIXME: The necessary dtprel relocations don't seem to be supported
// well in the GNU bfd and gold linkers at the moment. Therefore, by
// default, for now, fall back to GeneralDynamic code generation.
cl::opt<bool> EnableAArch64ELFLocalDynamicTLSGeneration(
"aarch64-elf-ldtls-generation", cl::Hidden,
cl::desc("Allow AArch64 Local Dynamic TLS code generation"),
cl::init(false));
/// Value type used for condition codes.
static const MVT MVT_CC = MVT::i32;
AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
const AArch64Subtarget &STI)
: TargetLowering(TM), Subtarget(&STI) {
// AArch64 doesn't have comparisons which set GPRs or setcc instructions, so
// we have to make something up. Arbitrarily, choose ZeroOrOne.
setBooleanContents(ZeroOrOneBooleanContent);
// When comparing vectors the result sets the different elements in the
// vector to all-one or all-zero.
setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
// Set up the register classes.
addRegisterClass(MVT::i32, &AArch64::GPR32allRegClass);
addRegisterClass(MVT::i64, &AArch64::GPR64allRegClass);
if (Subtarget->hasFPARMv8()) {
addRegisterClass(MVT::f16, &AArch64::FPR16RegClass);
addRegisterClass(MVT::f32, &AArch64::FPR32RegClass);
addRegisterClass(MVT::f64, &AArch64::FPR64RegClass);
addRegisterClass(MVT::f128, &AArch64::FPR128RegClass);
}
if (Subtarget->hasNEON()) {
addRegisterClass(MVT::v16i8, &AArch64::FPR8RegClass);
addRegisterClass(MVT::v8i16, &AArch64::FPR16RegClass);
// Someone set us up the NEON.
addDRTypeForNEON(MVT::v2f32);
addDRTypeForNEON(MVT::v8i8);
addDRTypeForNEON(MVT::v4i16);
addDRTypeForNEON(MVT::v2i32);
addDRTypeForNEON(MVT::v1i64);
addDRTypeForNEON(MVT::v1f64);
addDRTypeForNEON(MVT::v4f16);
addQRTypeForNEON(MVT::v4f32);
addQRTypeForNEON(MVT::v2f64);
addQRTypeForNEON(MVT::v16i8);
addQRTypeForNEON(MVT::v8i16);
addQRTypeForNEON(MVT::v4i32);
addQRTypeForNEON(MVT::v2i64);
addQRTypeForNEON(MVT::v8f16);
}
// Compute derived properties from the register classes
computeRegisterProperties(Subtarget->getRegisterInfo());
// Provide all sorts of operation actions
setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);
setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom);
setOperationAction(ISD::SETCC, MVT::i32, Custom);
setOperationAction(ISD::SETCC, MVT::i64, Custom);
setOperationAction(ISD::SETCC, MVT::f32, Custom);
setOperationAction(ISD::SETCC, MVT::f64, Custom);
setOperationAction(ISD::BITREVERSE, MVT::i32, Legal);
setOperationAction(ISD::BITREVERSE, MVT::i64, Legal);
setOperationAction(ISD::BRCOND, MVT::Other, Expand);
setOperationAction(ISD::BR_CC, MVT::i32, Custom);
setOperationAction(ISD::BR_CC, MVT::i64, Custom);
setOperationAction(ISD::BR_CC, MVT::f32, Custom);
setOperationAction(ISD::BR_CC, MVT::f64, Custom);
setOperationAction(ISD::SELECT, MVT::i32, Custom);
setOperationAction(ISD::SELECT, MVT::i64, Custom);
setOperationAction(ISD::SELECT, MVT::f32, Custom);
setOperationAction(ISD::SELECT, MVT::f64, Custom);
setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);
setOperationAction(ISD::SELECT_CC, MVT::i64, Custom);
setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
setOperationAction(ISD::SELECT_CC, MVT::f64, Custom);
setOperationAction(ISD::BR_JT, MVT::Other, Expand);
setOperationAction(ISD::JumpTable, MVT::i64, Custom);
setOperationAction(ISD::SHL_PARTS, MVT::i64, Custom);
setOperationAction(ISD::SRA_PARTS, MVT::i64, Custom);
setOperationAction(ISD::SRL_PARTS, MVT::i64, Custom);
setOperationAction(ISD::FREM, MVT::f32, Expand);
setOperationAction(ISD::FREM, MVT::f64, Expand);
setOperationAction(ISD::FREM, MVT::f80, Expand);
// Custom lowering hooks are needed for XOR
// to fold it into CSINC/CSINV.
setOperationAction(ISD::XOR, MVT::i32, Custom);
setOperationAction(ISD::XOR, MVT::i64, Custom);
// Virtually no operation on f128 is legal, but LLVM can't expand them when
// there's a valid register class, so we need custom operations in most cases.
setOperationAction(ISD::FABS, MVT::f128, Expand);
setOperationAction(ISD::FADD, MVT::f128, Custom);
setOperationAction(ISD::FCOPYSIGN, MVT::f128, Expand);
setOperationAction(ISD::FCOS, MVT::f128, Expand);
setOperationAction(ISD::FDIV, MVT::f128, Custom);
setOperationAction(ISD::FMA, MVT::f128, Expand);
setOperationAction(ISD::FMUL, MVT::f128, Custom);
setOperationAction(ISD::FNEG, MVT::f128, Expand);
setOperationAction(ISD::FPOW, MVT::f128, Expand);
setOperationAction(ISD::FREM, MVT::f128, Expand);
setOperationAction(ISD::FRINT, MVT::f128, Expand);
setOperationAction(ISD::FSIN, MVT::f128, Expand);
setOperationAction(ISD::FSINCOS, MVT::f128, Expand);
setOperationAction(ISD::FSQRT, MVT::f128, Expand);
setOperationAction(ISD::FSUB, MVT::f128, Custom);
setOperationAction(ISD::FTRUNC, MVT::f128, Expand);
setOperationAction(ISD::SETCC, MVT::f128, Custom);
setOperationAction(ISD::BR_CC, MVT::f128, Custom);
setOperationAction(ISD::SELECT, MVT::f128, Custom);
setOperationAction(ISD::SELECT_CC, MVT::f128, Custom);
setOperationAction(ISD::FP_EXTEND, MVT::f128, Custom);
// Lowering for many of the conversions is actually specified by the non-f128
// type. The LowerXXX function will be trivial when f128 isn't involved.
setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
setOperationAction(ISD::FP_TO_SINT, MVT::i128, Custom);
setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
setOperationAction(ISD::FP_TO_UINT, MVT::i128, Custom);
setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);
setOperationAction(ISD::SINT_TO_FP, MVT::i128, Custom);
setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom);
setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
setOperationAction(ISD::UINT_TO_FP, MVT::i128, Custom);
setOperationAction(ISD::FP_ROUND, MVT::f32, Custom);
setOperationAction(ISD::FP_ROUND, MVT::f64, Custom);
// Variable arguments.
setOperationAction(ISD::VASTART, MVT::Other, Custom);
setOperationAction(ISD::VAARG, MVT::Other, Custom);
setOperationAction(ISD::VACOPY, MVT::Other, Custom);
setOperationAction(ISD::VAEND, MVT::Other, Expand);
// Variable-sized objects.
setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Expand);
// Constant pool entries
setOperationAction(ISD::ConstantPool, MVT::i64, Custom);
// BlockAddress
setOperationAction(ISD::BlockAddress, MVT::i64, Custom);
// Add/Sub overflow ops with MVT::Glues are lowered to NZCV dependences.
setOperationAction(ISD::ADDC, MVT::i32, Custom);
setOperationAction(ISD::ADDE, MVT::i32, Custom);
setOperationAction(ISD::SUBC, MVT::i32, Custom);
setOperationAction(ISD::SUBE, MVT::i32, Custom);
setOperationAction(ISD::ADDC, MVT::i64, Custom);
setOperationAction(ISD::ADDE, MVT::i64, Custom);
setOperationAction(ISD::SUBC, MVT::i64, Custom);
setOperationAction(ISD::SUBE, MVT::i64, Custom);
// AArch64 lacks both left-rotate and popcount instructions.
setOperationAction(ISD::ROTL, MVT::i32, Expand);
setOperationAction(ISD::ROTL, MVT::i64, Expand);
for (MVT VT : MVT::vector_valuetypes()) {
setOperationAction(ISD::ROTL, VT, Expand);
setOperationAction(ISD::ROTR, VT, Expand);
}
// AArch64 doesn't have {U|S}MUL_LOHI.
setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand);
setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand);
setOperationAction(ISD::CTPOP, MVT::i32, Custom);
setOperationAction(ISD::CTPOP, MVT::i64, Custom);
setOperationAction(ISD::SDIVREM, MVT::i32, Expand);
setOperationAction(ISD::SDIVREM, MVT::i64, Expand);
for (MVT VT : MVT::vector_valuetypes()) {
setOperationAction(ISD::SDIVREM, VT, Expand);
setOperationAction(ISD::UDIVREM, VT, Expand);
}
setOperationAction(ISD::SREM, MVT::i32, Expand);
setOperationAction(ISD::SREM, MVT::i64, Expand);
setOperationAction(ISD::UDIVREM, MVT::i32, Expand);
setOperationAction(ISD::UDIVREM, MVT::i64, Expand);
setOperationAction(ISD::UREM, MVT::i32, Expand);
setOperationAction(ISD::UREM, MVT::i64, Expand);
// Custom lower Add/Sub/Mul with overflow.
setOperationAction(ISD::SADDO, MVT::i32, Custom);
setOperationAction(ISD::SADDO, MVT::i64, Custom);
setOperationAction(ISD::UADDO, MVT::i32, Custom);
setOperationAction(ISD::UADDO, MVT::i64, Custom);
setOperationAction(ISD::SSUBO, MVT::i32, Custom);
setOperationAction(ISD::SSUBO, MVT::i64, Custom);
setOperationAction(ISD::USUBO, MVT::i32, Custom);
setOperationAction(ISD::USUBO, MVT::i64, Custom);
setOperationAction(ISD::SMULO, MVT::i32, Custom);
setOperationAction(ISD::SMULO, MVT::i64, Custom);
setOperationAction(ISD::UMULO, MVT::i32, Custom);
setOperationAction(ISD::UMULO, MVT::i64, Custom);
setOperationAction(ISD::FSIN, MVT::f32, Expand);
setOperationAction(ISD::FSIN, MVT::f64, Expand);
setOperationAction(ISD::FCOS, MVT::f32, Expand);
setOperationAction(ISD::FCOS, MVT::f64, Expand);
setOperationAction(ISD::FPOW, MVT::f32, Expand);
setOperationAction(ISD::FPOW, MVT::f64, Expand);
setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom);
setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
// f16 is a storage-only type, always promote it to f32.
setOperationAction(ISD::SETCC, MVT::f16, Promote);
setOperationAction(ISD::BR_CC, MVT::f16, Promote);
setOperationAction(ISD::SELECT_CC, MVT::f16, Promote);
setOperationAction(ISD::SELECT, MVT::f16, Promote);
setOperationAction(ISD::FADD, MVT::f16, Promote);
setOperationAction(ISD::FSUB, MVT::f16, Promote);
setOperationAction(ISD::FMUL, MVT::f16, Promote);
setOperationAction(ISD::FDIV, MVT::f16, Promote);
setOperationAction(ISD::FREM, MVT::f16, Promote);
setOperationAction(ISD::FMA, MVT::f16, Promote);
setOperationAction(ISD::FNEG, MVT::f16, Promote);
setOperationAction(ISD::FABS, MVT::f16, Promote);
setOperationAction(ISD::FCEIL, MVT::f16, Promote);
setOperationAction(ISD::FCOPYSIGN, MVT::f16, Promote);
setOperationAction(ISD::FCOS, MVT::f16, Promote);
setOperationAction(ISD::FFLOOR, MVT::f16, Promote);
setOperationAction(ISD::FNEARBYINT, MVT::f16, Promote);
setOperationAction(ISD::FPOW, MVT::f16, Promote);
setOperationAction(ISD::FPOWI, MVT::f16, Promote);
setOperationAction(ISD::FRINT, MVT::f16, Promote);
setOperationAction(ISD::FSIN, MVT::f16, Promote);
setOperationAction(ISD::FSINCOS, MVT::f16, Promote);
setOperationAction(ISD::FSQRT, MVT::f16, Promote);
setOperationAction(ISD::FEXP, MVT::f16, Promote);
setOperationAction(ISD::FEXP2, MVT::f16, Promote);
setOperationAction(ISD::FLOG, MVT::f16, Promote);
setOperationAction(ISD::FLOG2, MVT::f16, Promote);
setOperationAction(ISD::FLOG10, MVT::f16, Promote);
setOperationAction(ISD::FROUND, MVT::f16, Promote);
setOperationAction(ISD::FTRUNC, MVT::f16, Promote);
setOperationAction(ISD::FMINNUM, MVT::f16, Promote);
setOperationAction(ISD::FMAXNUM, MVT::f16, Promote);
setOperationAction(ISD::FMINNAN, MVT::f16, Promote);
setOperationAction(ISD::FMAXNAN, MVT::f16, Promote);
// v4f16 is also a storage-only type, so promote it to v4f32 when that is
// known to be safe.
setOperationAction(ISD::FADD, MVT::v4f16, Promote);
setOperationAction(ISD::FSUB, MVT::v4f16, Promote);
setOperationAction(ISD::FMUL, MVT::v4f16, Promote);
setOperationAction(ISD::FDIV, MVT::v4f16, Promote);
setOperationAction(ISD::FP_EXTEND, MVT::v4f16, Promote);
setOperationAction(ISD::FP_ROUND, MVT::v4f16, Promote);
AddPromotedToType(ISD::FADD, MVT::v4f16, MVT::v4f32);
AddPromotedToType(ISD::FSUB, MVT::v4f16, MVT::v4f32);
AddPromotedToType(ISD::FMUL, MVT::v4f16, MVT::v4f32);
AddPromotedToType(ISD::FDIV, MVT::v4f16, MVT::v4f32);
AddPromotedToType(ISD::FP_EXTEND, MVT::v4f16, MVT::v4f32);
AddPromotedToType(ISD::FP_ROUND, MVT::v4f16, MVT::v4f32);
// Expand all other v4f16 operations.
// FIXME: We could generate better code by promoting some operations to
// a pair of v4f32s
setOperationAction(ISD::FABS, MVT::v4f16, Expand);
setOperationAction(ISD::FCEIL, MVT::v4f16, Expand);
setOperationAction(ISD::FCOPYSIGN, MVT::v4f16, Expand);
setOperationAction(ISD::FCOS, MVT::v4f16, Expand);
setOperationAction(ISD::FFLOOR, MVT::v4f16, Expand);
setOperationAction(ISD::FMA, MVT::v4f16, Expand);
setOperationAction(ISD::FNEARBYINT, MVT::v4f16, Expand);
setOperationAction(ISD::FNEG, MVT::v4f16, Expand);
setOperationAction(ISD::FPOW, MVT::v4f16, Expand);
setOperationAction(ISD::FPOWI, MVT::v4f16, Expand);
setOperationAction(ISD::FREM, MVT::v4f16, Expand);
setOperationAction(ISD::FROUND, MVT::v4f16, Expand);
setOperationAction(ISD::FRINT, MVT::v4f16, Expand);
setOperationAction(ISD::FSIN, MVT::v4f16, Expand);
setOperationAction(ISD::FSINCOS, MVT::v4f16, Expand);
setOperationAction(ISD::FSQRT, MVT::v4f16, Expand);
setOperationAction(ISD::FTRUNC, MVT::v4f16, Expand);
setOperationAction(ISD::SETCC, MVT::v4f16, Expand);
setOperationAction(ISD::BR_CC, MVT::v4f16, Expand);
setOperationAction(ISD::SELECT, MVT::v4f16, Expand);
setOperationAction(ISD::SELECT_CC, MVT::v4f16, Expand);
setOperationAction(ISD::FEXP, MVT::v4f16, Expand);
setOperationAction(ISD::FEXP2, MVT::v4f16, Expand);
setOperationAction(ISD::FLOG, MVT::v4f16, Expand);
setOperationAction(ISD::FLOG2, MVT::v4f16, Expand);
setOperationAction(ISD::FLOG10, MVT::v4f16, Expand);
// v8f16 is also a storage-only type, so expand it.
setOperationAction(ISD::FABS, MVT::v8f16, Expand);
setOperationAction(ISD::FADD, MVT::v8f16, Expand);
setOperationAction(ISD::FCEIL, MVT::v8f16, Expand);
setOperationAction(ISD::FCOPYSIGN, MVT::v8f16, Expand);
setOperationAction(ISD::FCOS, MVT::v8f16, Expand);
setOperationAction(ISD::FDIV, MVT::v8f16, Expand);
setOperationAction(ISD::FFLOOR, MVT::v8f16, Expand);
setOperationAction(ISD::FMA, MVT::v8f16, Expand);
setOperationAction(ISD::FMUL, MVT::v8f16, Expand);
setOperationAction(ISD::FNEARBYINT, MVT::v8f16, Expand);
setOperationAction(ISD::FNEG, MVT::v8f16, Expand);
setOperationAction(ISD::FPOW, MVT::v8f16, Expand);
setOperationAction(ISD::FPOWI, MVT::v8f16, Expand);
setOperationAction(ISD::FREM, MVT::v8f16, Expand);
setOperationAction(ISD::FROUND, MVT::v8f16, Expand);
setOperationAction(ISD::FRINT, MVT::v8f16, Expand);
setOperationAction(ISD::FSIN, MVT::v8f16, Expand);
setOperationAction(ISD::FSINCOS, MVT::v8f16, Expand);
setOperationAction(ISD::FSQRT, MVT::v8f16, Expand);
setOperationAction(ISD::FSUB, MVT::v8f16, Expand);
setOperationAction(ISD::FTRUNC, MVT::v8f16, Expand);
setOperationAction(ISD::SETCC, MVT::v8f16, Expand);
setOperationAction(ISD::BR_CC, MVT::v8f16, Expand);
setOperationAction(ISD::SELECT, MVT::v8f16, Expand);
setOperationAction(ISD::SELECT_CC, MVT::v8f16, Expand);
setOperationAction(ISD::FP_EXTEND, MVT::v8f16, Expand);
setOperationAction(ISD::FEXP, MVT::v8f16, Expand);
setOperationAction(ISD::FEXP2, MVT::v8f16, Expand);
setOperationAction(ISD::FLOG, MVT::v8f16, Expand);
setOperationAction(ISD::FLOG2, MVT::v8f16, Expand);
setOperationAction(ISD::FLOG10, MVT::v8f16, Expand);
// AArch64 has implementations of a lot of rounding-like FP operations.
for (MVT Ty : {MVT::f32, MVT::f64}) {
setOperationAction(ISD::FFLOOR, Ty, Legal);
setOperationAction(ISD::FNEARBYINT, Ty, Legal);
setOperationAction(ISD::FCEIL, Ty, Legal);
setOperationAction(ISD::FRINT, Ty, Legal);
setOperationAction(ISD::FTRUNC, Ty, Legal);
setOperationAction(ISD::FROUND, Ty, Legal);
setOperationAction(ISD::FMINNUM, Ty, Legal);
setOperationAction(ISD::FMAXNUM, Ty, Legal);
setOperationAction(ISD::FMINNAN, Ty, Legal);
setOperationAction(ISD::FMAXNAN, Ty, Legal);
}
setOperationAction(ISD::PREFETCH, MVT::Other, Custom);
setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i128, Custom);
// Lower READCYCLECOUNTER using an mrs from PMCCNTR_EL0.
// This requires the Performance Monitors extension.
if (Subtarget->hasPerfMon())
setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Legal);
if (Subtarget->isTargetMachO()) {
// For iOS, we don't want to the normal expansion of a libcall to
// sincos. We want to issue a libcall to __sincos_stret to avoid memory
// traffic.
setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
} else {
setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
}
// Make floating-point constants legal for the large code model, so they don't
// become loads from the constant pool.
if (Subtarget->isTargetMachO() && TM.getCodeModel() == CodeModel::Large) {
setOperationAction(ISD::ConstantFP, MVT::f32, Legal);
setOperationAction(ISD::ConstantFP, MVT::f64, Legal);
}
// AArch64 does not have floating-point extending loads, i1 sign-extending
// load, floating-point truncating stores, or v2i32->v2i16 truncating store.
for (MVT VT : MVT::fp_valuetypes()) {
setLoadExtAction(ISD::EXTLOAD, VT, MVT::f16, Expand);
setLoadExtAction(ISD::EXTLOAD, VT, MVT::f32, Expand);
setLoadExtAction(ISD::EXTLOAD, VT, MVT::f64, Expand);
setLoadExtAction(ISD::EXTLOAD, VT, MVT::f80, Expand);
}
for (MVT VT : MVT::integer_valuetypes())
setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Expand);
setTruncStoreAction(MVT::f32, MVT::f16, Expand);
setTruncStoreAction(MVT::f64, MVT::f32, Expand);
setTruncStoreAction(MVT::f64, MVT::f16, Expand);
setTruncStoreAction(MVT::f128, MVT::f80, Expand);
setTruncStoreAction(MVT::f128, MVT::f64, Expand);
setTruncStoreAction(MVT::f128, MVT::f32, Expand);
setTruncStoreAction(MVT::f128, MVT::f16, Expand);
setOperationAction(ISD::BITCAST, MVT::i16, Custom);
setOperationAction(ISD::BITCAST, MVT::f16, Custom);
// Indexed loads and stores are supported.
for (unsigned im = (unsigned)ISD::PRE_INC;
im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
setIndexedLoadAction(im, MVT::i8, Legal);
setIndexedLoadAction(im, MVT::i16, Legal);
setIndexedLoadAction(im, MVT::i32, Legal);
setIndexedLoadAction(im, MVT::i64, Legal);
setIndexedLoadAction(im, MVT::f64, Legal);
setIndexedLoadAction(im, MVT::f32, Legal);
setIndexedLoadAction(im, MVT::f16, Legal);
setIndexedStoreAction(im, MVT::i8, Legal);
setIndexedStoreAction(im, MVT::i16, Legal);
setIndexedStoreAction(im, MVT::i32, Legal);
setIndexedStoreAction(im, MVT::i64, Legal);
setIndexedStoreAction(im, MVT::f64, Legal);
setIndexedStoreAction(im, MVT::f32, Legal);
setIndexedStoreAction(im, MVT::f16, Legal);
}
// Trap.
setOperationAction(ISD::TRAP, MVT::Other, Legal);
// We combine OR nodes for bitfield operations.
setTargetDAGCombine(ISD::OR);
// Vector add and sub nodes may conceal a high-half opportunity.
// Also, try to fold ADD into CSINC/CSINV..
setTargetDAGCombine(ISD::ADD);
setTargetDAGCombine(ISD::SUB);
setTargetDAGCombine(ISD::SRL);
setTargetDAGCombine(ISD::XOR);
setTargetDAGCombine(ISD::SINT_TO_FP);
setTargetDAGCombine(ISD::UINT_TO_FP);
setTargetDAGCombine(ISD::FP_TO_SINT);
setTargetDAGCombine(ISD::FP_TO_UINT);
setTargetDAGCombine(ISD::FDIV);
setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN);
setTargetDAGCombine(ISD::ANY_EXTEND);
setTargetDAGCombine(ISD::ZERO_EXTEND);
setTargetDAGCombine(ISD::SIGN_EXTEND);
setTargetDAGCombine(ISD::BITCAST);
setTargetDAGCombine(ISD::CONCAT_VECTORS);
setTargetDAGCombine(ISD::STORE);
if (Subtarget->supportsAddressTopByteIgnored())
setTargetDAGCombine(ISD::LOAD);
setTargetDAGCombine(ISD::MUL);
setTargetDAGCombine(ISD::SELECT);
setTargetDAGCombine(ISD::VSELECT);
setTargetDAGCombine(ISD::INTRINSIC_VOID);
setTargetDAGCombine(ISD::INTRINSIC_W_CHAIN);
setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);
setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
MaxStoresPerMemset = MaxStoresPerMemsetOptSize = 8;
MaxStoresPerMemcpy = MaxStoresPerMemcpyOptSize = 4;
MaxStoresPerMemmove = MaxStoresPerMemmoveOptSize = 4;
setStackPointerRegisterToSaveRestore(AArch64::SP);
setSchedulingPreference(Sched::Hybrid);
// Enable TBZ/TBNZ
MaskAndBranchFoldingIsLegal = true;
EnableExtLdPromotion = true;
// Set required alignment.
setMinFunctionAlignment(2);
// Set preferred alignments.
setPrefFunctionAlignment(STI.getPrefFunctionAlignment());
setPrefLoopAlignment(STI.getPrefLoopAlignment());
// Only change the limit for entries in a jump table if specified by
// the subtarget, but not at the command line.
unsigned MaxJT = STI.getMaximumJumpTableSize();
if (MaxJT && getMaximumJumpTableSize() == 0)
setMaximumJumpTableSize(MaxJT);
setHasExtractBitsInsn(true);
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
if (Subtarget->hasNEON()) {
// FIXME: v1f64 shouldn't be legal if we can avoid it, because it leads to
// silliness like this:
setOperationAction(ISD::FABS, MVT::v1f64, Expand);
setOperationAction(ISD::FADD, MVT::v1f64, Expand);
setOperationAction(ISD::FCEIL, MVT::v1f64, Expand);
setOperationAction(ISD::FCOPYSIGN, MVT::v1f64, Expand);
setOperationAction(ISD::FCOS, MVT::v1f64, Expand);
setOperationAction(ISD::FDIV, MVT::v1f64, Expand);
setOperationAction(ISD::FFLOOR, MVT::v1f64, Expand);
setOperationAction(ISD::FMA, MVT::v1f64, Expand);
setOperationAction(ISD::FMUL, MVT::v1f64, Expand);
setOperationAction(ISD::FNEARBYINT, MVT::v1f64, Expand);
setOperationAction(ISD::FNEG, MVT::v1f64, Expand);
setOperationAction(ISD::FPOW, MVT::v1f64, Expand);
setOperationAction(ISD::FREM, MVT::v1f64, Expand);
setOperationAction(ISD::FROUND, MVT::v1f64, Expand);
setOperationAction(ISD::FRINT, MVT::v1f64, Expand);
setOperationAction(ISD::FSIN, MVT::v1f64, Expand);
setOperationAction(ISD::FSINCOS, MVT::v1f64, Expand);
setOperationAction(ISD::FSQRT, MVT::v1f64, Expand);
setOperationAction(ISD::FSUB, MVT::v1f64, Expand);
setOperationAction(ISD::FTRUNC, MVT::v1f64, Expand);
setOperationAction(ISD::SETCC, MVT::v1f64, Expand);
setOperationAction(ISD::BR_CC, MVT::v1f64, Expand);
setOperationAction(ISD::SELECT, MVT::v1f64, Expand);
setOperationAction(ISD::SELECT_CC, MVT::v1f64, Expand);
setOperationAction(ISD::FP_EXTEND, MVT::v1f64, Expand);
setOperationAction(ISD::FP_TO_SINT, MVT::v1i64, Expand);
setOperationAction(ISD::FP_TO_UINT, MVT::v1i64, Expand);
setOperationAction(ISD::SINT_TO_FP, MVT::v1i64, Expand);
setOperationAction(ISD::UINT_TO_FP, MVT::v1i64, Expand);
setOperationAction(ISD::FP_ROUND, MVT::v1f64, Expand);
setOperationAction(ISD::MUL, MVT::v1i64, Expand);
// AArch64 doesn't have a direct vector ->f32 conversion instructions for
// elements smaller than i32, so promote the input to i32 first.
setOperationAction(ISD::UINT_TO_FP, MVT::v4i8, Promote);
setOperationAction(ISD::SINT_TO_FP, MVT::v4i8, Promote);
setOperationAction(ISD::UINT_TO_FP, MVT::v4i16, Promote);
setOperationAction(ISD::SINT_TO_FP, MVT::v4i16, Promote);
// i8 and i16 vector elements also need promotion to i32 for v8i8 or v8i16
// -> v8f16 conversions.
setOperationAction(ISD::SINT_TO_FP, MVT::v8i8, Promote);
setOperationAction(ISD::UINT_TO_FP, MVT::v8i8, Promote);
setOperationAction(ISD::SINT_TO_FP, MVT::v8i16, Promote);
setOperationAction(ISD::UINT_TO_FP, MVT::v8i16, Promote);
// Similarly, there is no direct i32 -> f64 vector conversion instruction.
setOperationAction(ISD::SINT_TO_FP, MVT::v2i32, Custom);
setOperationAction(ISD::UINT_TO_FP, MVT::v2i32, Custom);
setOperationAction(ISD::SINT_TO_FP, MVT::v2i64, Custom);
setOperationAction(ISD::UINT_TO_FP, MVT::v2i64, Custom);
// Or, direct i32 -> f16 vector conversion. Set it so custom, so the
// conversion happens in two steps: v4i32 -> v4f32 -> v4f16
setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Custom);
setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Custom);
setOperationAction(ISD::CTLZ, MVT::v1i64, Expand);
setOperationAction(ISD::CTLZ, MVT::v2i64, Expand);
setOperationAction(ISD::CTTZ, MVT::v2i8, Expand);
setOperationAction(ISD::CTTZ, MVT::v4i16, Expand);
setOperationAction(ISD::CTTZ, MVT::v2i32, Expand);
setOperationAction(ISD::CTTZ, MVT::v1i64, Expand);
setOperationAction(ISD::CTTZ, MVT::v16i8, Expand);
setOperationAction(ISD::CTTZ, MVT::v8i16, Expand);
setOperationAction(ISD::CTTZ, MVT::v4i32, Expand);
setOperationAction(ISD::CTTZ, MVT::v2i64, Expand);
// AArch64 doesn't have MUL.2d:
setOperationAction(ISD::MUL, MVT::v2i64, Expand);
// Custom handling for some quad-vector types to detect MULL.
setOperationAction(ISD::MUL, MVT::v8i16, Custom);
setOperationAction(ISD::MUL, MVT::v4i32, Custom);
setOperationAction(ISD::MUL, MVT::v2i64, Custom);
setOperationAction(ISD::ANY_EXTEND, MVT::v4i32, Legal);
setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand);
// Likewise, narrowing and extending vector loads/stores aren't handled
// directly.
for (MVT VT : MVT::vector_valuetypes()) {
setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Expand);
setOperationAction(ISD::MULHS, VT, Expand);
setOperationAction(ISD::SMUL_LOHI, VT, Expand);
setOperationAction(ISD::MULHU, VT, Expand);
setOperationAction(ISD::UMUL_LOHI, VT, Expand);
setOperationAction(ISD::BSWAP, VT, Expand);
for (MVT InnerVT : MVT::vector_valuetypes()) {
setTruncStoreAction(VT, InnerVT, Expand);
setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand);
setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand);
setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand);
}
}
// AArch64 has implementations of a lot of rounding-like FP operations.
for (MVT Ty : {MVT::v2f32, MVT::v4f32, MVT::v2f64}) {
setOperationAction(ISD::FFLOOR, Ty, Legal);
setOperationAction(ISD::FNEARBYINT, Ty, Legal);
setOperationAction(ISD::FCEIL, Ty, Legal);
setOperationAction(ISD::FRINT, Ty, Legal);
setOperationAction(ISD::FTRUNC, Ty, Legal);
setOperationAction(ISD::FROUND, Ty, Legal);
}
}
PredictableSelectIsExpensive = Subtarget->predictableSelectIsExpensive();
}
void AArch64TargetLowering::addTypeForNEON(MVT VT, MVT PromotedBitwiseVT) {
if (VT == MVT::v2f32 || VT == MVT::v4f16) {
setOperationAction(ISD::LOAD, VT, Promote);
AddPromotedToType(ISD::LOAD, VT, MVT::v2i32);
setOperationAction(ISD::STORE, VT, Promote);
AddPromotedToType(ISD::STORE, VT, MVT::v2i32);
} else if (VT == MVT::v2f64 || VT == MVT::v4f32 || VT == MVT::v8f16) {
setOperationAction(ISD::LOAD, VT, Promote);
AddPromotedToType(ISD::LOAD, VT, MVT::v2i64);
setOperationAction(ISD::STORE, VT, Promote);
AddPromotedToType(ISD::STORE, VT, MVT::v2i64);
}
// Mark vector float intrinsics as expand.
if (VT == MVT::v2f32 || VT == MVT::v4f32 || VT == MVT::v2f64) {
setOperationAction(ISD::FSIN, VT, Expand);
setOperationAction(ISD::FCOS, VT, Expand);
setOperationAction(ISD::FPOWI, VT, Expand);
setOperationAction(ISD::FPOW, VT, Expand);
setOperationAction(ISD::FLOG, VT, Expand);
setOperationAction(ISD::FLOG2, VT, Expand);
setOperationAction(ISD::FLOG10, VT, Expand);
setOperationAction(ISD::FEXP, VT, Expand);
setOperationAction(ISD::FEXP2, VT, Expand);
// But we do support custom-lowering for FCOPYSIGN.
setOperationAction(ISD::FCOPYSIGN, VT, Custom);
}
setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
setOperationAction(ISD::SRA, VT, Custom);
setOperationAction(ISD::SRL, VT, Custom);
setOperationAction(ISD::SHL, VT, Custom);
setOperationAction(ISD::AND, VT, Custom);
setOperationAction(ISD::OR, VT, Custom);
setOperationAction(ISD::SETCC, VT, Custom);
setOperationAction(ISD::CONCAT_VECTORS, VT, Legal);
setOperationAction(ISD::SELECT, VT, Expand);
setOperationAction(ISD::SELECT_CC, VT, Expand);
setOperationAction(ISD::VSELECT, VT, Expand);
for (MVT InnerVT : MVT::all_valuetypes())
setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
// CNT supports only B element sizes.
if (VT != MVT::v8i8 && VT != MVT::v16i8)
setOperationAction(ISD::CTPOP, VT, Expand);
setOperationAction(ISD::UDIV, VT, Expand);
setOperationAction(ISD::SDIV, VT, Expand);
setOperationAction(ISD::UREM, VT, Expand);
setOperationAction(ISD::SREM, VT, Expand);
setOperationAction(ISD::FREM, VT, Expand);
setOperationAction(ISD::FP_TO_SINT, VT, Custom);
setOperationAction(ISD::FP_TO_UINT, VT, Custom);
// [SU][MIN|MAX] are available for all NEON types apart from i64.
if (!VT.isFloatingPoint() && VT != MVT::v2i64 && VT != MVT::v1i64)
for (unsigned Opcode : {ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX})
setOperationAction(Opcode, VT, Legal);
// F[MIN|MAX][NUM|NAN] are available for all FP NEON types (not f16 though!).
if (VT.isFloatingPoint() && VT.getVectorElementType() != MVT::f16)
for (unsigned Opcode : {ISD::FMINNAN, ISD::FMAXNAN,
ISD::FMINNUM, ISD::FMAXNUM})
setOperationAction(Opcode, VT, Legal);
if (Subtarget->isLittleEndian()) {
for (unsigned im = (unsigned)ISD::PRE_INC;
im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
setIndexedLoadAction(im, VT, Legal);
setIndexedStoreAction(im, VT, Legal);
}
}
}
void AArch64TargetLowering::addDRTypeForNEON(MVT VT) {
addRegisterClass(VT, &AArch64::FPR64RegClass);
addTypeForNEON(VT, MVT::v2i32);
}
void AArch64TargetLowering::addQRTypeForNEON(MVT VT) {
addRegisterClass(VT, &AArch64::FPR128RegClass);
addTypeForNEON(VT, MVT::v4i32);
}
EVT AArch64TargetLowering::getSetCCResultType(const DataLayout &, LLVMContext &,
EVT VT) const {
if (!VT.isVector())
return MVT::i32;
return VT.changeVectorElementTypeToInteger();
}
/// computeKnownBitsForTargetNode - Determine which of the bits specified in
/// Mask are known to be either zero or one and return them in the
/// KnownZero/KnownOne bitsets.
void AArch64TargetLowering::computeKnownBitsForTargetNode(
const SDValue Op, APInt &KnownZero, APInt &KnownOne,
const SelectionDAG &DAG, unsigned Depth) const {
switch (Op.getOpcode()) {
default:
break;
case AArch64ISD::CSEL: {
APInt KnownZero2, KnownOne2;
DAG.computeKnownBits(Op->getOperand(0), KnownZero, KnownOne, Depth + 1);
DAG.computeKnownBits(Op->getOperand(1), KnownZero2, KnownOne2, Depth + 1);
KnownZero &= KnownZero2;
KnownOne &= KnownOne2;
break;
}
case ISD::INTRINSIC_W_CHAIN: {
ConstantSDNode *CN = cast<ConstantSDNode>(Op->getOperand(1));
Intrinsic::ID IntID = static_cast<Intrinsic::ID>(CN->getZExtValue());
switch (IntID) {
default: return;
case Intrinsic::aarch64_ldaxr:
case Intrinsic::aarch64_ldxr: {
unsigned BitWidth = KnownOne.getBitWidth();
EVT VT = cast<MemIntrinsicSDNode>(Op)->getMemoryVT();
unsigned MemBits = VT.getScalarSizeInBits();
KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - MemBits);
return;
}
}
break;
}
case ISD::INTRINSIC_WO_CHAIN:
case ISD::INTRINSIC_VOID: {
unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
switch (IntNo) {
default:
break;
case Intrinsic::aarch64_neon_umaxv:
case Intrinsic::aarch64_neon_uminv: {
// Figure out the datatype of the vector operand. The UMINV instruction
// will zero extend the result, so we can mark as known zero all the
// bits larger than the element datatype. 32-bit or larget doesn't need
// this as those are legal types and will be handled by isel directly.
MVT VT = Op.getOperand(1).getValueType().getSimpleVT();
unsigned BitWidth = KnownZero.getBitWidth();
if (VT == MVT::v8i8 || VT == MVT::v16i8) {
assert(BitWidth >= 8 && "Unexpected width!");
APInt Mask = APInt::getHighBitsSet(BitWidth, BitWidth - 8);
KnownZero |= Mask;
} else if (VT == MVT::v4i16 || VT == MVT::v8i16) {
assert(BitWidth >= 16 && "Unexpected width!");
APInt Mask = APInt::getHighBitsSet(BitWidth, BitWidth - 16);
KnownZero |= Mask;
}
break;
} break;
}
}
}
}
MVT AArch64TargetLowering::getScalarShiftAmountTy(const DataLayout &DL,
EVT) const {
return MVT::i64;
}
bool AArch64TargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
unsigned AddrSpace,
unsigned Align,
bool *Fast) const {
if (Subtarget->requiresStrictAlign())
return false;
if (Fast) {
// Some CPUs are fine with unaligned stores except for 128-bit ones.
*Fast = !Subtarget->isMisaligned128StoreSlow() || VT.getStoreSize() != 16 ||
// See comments in performSTORECombine() for more details about
// these conditions.
// Code that uses clang vector extensions can mark that it
// wants unaligned accesses to be treated as fast by
// underspecifying alignment to be 1 or 2.
Align <= 2 ||
// Disregard v2i64. Memcpy lowering produces those and splitting
// them regresses performance on micro-benchmarks and olden/bh.
VT == MVT::v2i64;
}
return true;
}
FastISel *
AArch64TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
const TargetLibraryInfo *libInfo) const {
return AArch64::createFastISel(funcInfo, libInfo);
}
const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
switch ((AArch64ISD::NodeType)Opcode) {
case AArch64ISD::FIRST_NUMBER: break;
case AArch64ISD::CALL: return "AArch64ISD::CALL";
case AArch64ISD::ADRP: return "AArch64ISD::ADRP";
case AArch64ISD::ADDlow: return "AArch64ISD::ADDlow";
case AArch64ISD::LOADgot: return "AArch64ISD::LOADgot";
case AArch64ISD::RET_FLAG: return "AArch64ISD::RET_FLAG";
case AArch64ISD::BRCOND: return "AArch64ISD::BRCOND";
case AArch64ISD::CSEL: return "AArch64ISD::CSEL";
case AArch64ISD::FCSEL: return "AArch64ISD::FCSEL";
case AArch64ISD::CSINV: return "AArch64ISD::CSINV";
case AArch64ISD::CSNEG: return "AArch64ISD::CSNEG";
case AArch64ISD::CSINC: return "AArch64ISD::CSINC";
case AArch64ISD::THREAD_POINTER: return "AArch64ISD::THREAD_POINTER";
case AArch64ISD::TLSDESC_CALLSEQ: return "AArch64ISD::TLSDESC_CALLSEQ";
case AArch64ISD::ADC: return "AArch64ISD::ADC";
case AArch64ISD::SBC: return "AArch64ISD::SBC";
case AArch64ISD::ADDS: return "AArch64ISD::ADDS";
case AArch64ISD::SUBS: return "AArch64ISD::SUBS";
case AArch64ISD::ADCS: return "AArch64ISD::ADCS";
case AArch64ISD::SBCS: return "AArch64ISD::SBCS";
case AArch64ISD::ANDS: return "AArch64ISD::ANDS";
case AArch64ISD::CCMP: return "AArch64ISD::CCMP";
case AArch64ISD::CCMN: return "AArch64ISD::CCMN";
case AArch64ISD::FCCMP: return "AArch64ISD::FCCMP";
case AArch64ISD::FCMP: return "AArch64ISD::FCMP";
case AArch64ISD::DUP: return "AArch64ISD::DUP";
case AArch64ISD::DUPLANE8: return "AArch64ISD::DUPLANE8";
case AArch64ISD::DUPLANE16: return "AArch64ISD::DUPLANE16";
case AArch64ISD::DUPLANE32: return "AArch64ISD::DUPLANE32";
case AArch64ISD::DUPLANE64: return "AArch64ISD::DUPLANE64";
case AArch64ISD::MOVI: return "AArch64ISD::MOVI";
case AArch64ISD::MOVIshift: return "AArch64ISD::MOVIshift";
case AArch64ISD::MOVIedit: return "AArch64ISD::MOVIedit";
case AArch64ISD::MOVImsl: return "AArch64ISD::MOVImsl";
case AArch64ISD::FMOV: return "AArch64ISD::FMOV";
case AArch64ISD::MVNIshift: return "AArch64ISD::MVNIshift";
case AArch64ISD::MVNImsl: return "AArch64ISD::MVNImsl";
case AArch64ISD::BICi: return "AArch64ISD::BICi";
case AArch64ISD::ORRi: return "AArch64ISD::ORRi";
case AArch64ISD::BSL: return "AArch64ISD::BSL";
case AArch64ISD::NEG: return "AArch64ISD::NEG";
case AArch64ISD::EXTR: return "AArch64ISD::EXTR";
case AArch64ISD::ZIP1: return "AArch64ISD::ZIP1";
case AArch64ISD::ZIP2: return "AArch64ISD::ZIP2";
case AArch64ISD::UZP1: return "AArch64ISD::UZP1";
case AArch64ISD::UZP2: return "AArch64ISD::UZP2";
case AArch64ISD::TRN1: return "AArch64ISD::TRN1";
case AArch64ISD::TRN2: return "AArch64ISD::TRN2";
case AArch64ISD::REV16: return "AArch64ISD::REV16";
case AArch64ISD::REV32: return "AArch64ISD::REV32";
case AArch64ISD::REV64: return "AArch64ISD::REV64";
case AArch64ISD::EXT: return "AArch64ISD::EXT";
case AArch64ISD::VSHL: return "AArch64ISD::VSHL";
case AArch64ISD::VLSHR: return "AArch64ISD::VLSHR";
case AArch64ISD::VASHR: return "AArch64ISD::VASHR";
case AArch64ISD::CMEQ: return "AArch64ISD::CMEQ";
case AArch64ISD::CMGE: return "AArch64ISD::CMGE";
case AArch64ISD::CMGT: return "AArch64ISD::CMGT";
case AArch64ISD::CMHI: return "AArch64ISD::CMHI";
case AArch64ISD::CMHS: return "AArch64ISD::CMHS";
case AArch64ISD::FCMEQ: return "AArch64ISD::FCMEQ";
case AArch64ISD::FCMGE: return "AArch64ISD::FCMGE";
case AArch64ISD::FCMGT: return "AArch64ISD::FCMGT";
case AArch64ISD::CMEQz: return "AArch64ISD::CMEQz";
case AArch64ISD::CMGEz: return "AArch64ISD::CMGEz";
case AArch64ISD::CMGTz: return "AArch64ISD::CMGTz";
case AArch64ISD::CMLEz: return "AArch64ISD::CMLEz";
case AArch64ISD::CMLTz: return "AArch64ISD::CMLTz";
case AArch64ISD::FCMEQz: return "AArch64ISD::FCMEQz";
case AArch64ISD::FCMGEz: return "AArch64ISD::FCMGEz";
case AArch64ISD::FCMGTz: return "AArch64ISD::FCMGTz";
case AArch64ISD::FCMLEz: return "AArch64ISD::FCMLEz";
case AArch64ISD::FCMLTz: return "AArch64ISD::FCMLTz";
case AArch64ISD::SADDV: return "AArch64ISD::SADDV";
case AArch64ISD::UADDV: return "AArch64ISD::UADDV";
case AArch64ISD::SMINV: return "AArch64ISD::SMINV";
case AArch64ISD::UMINV: return "AArch64ISD::UMINV";
case AArch64ISD::SMAXV: return "AArch64ISD::SMAXV";
case AArch64ISD::UMAXV: return "AArch64ISD::UMAXV";
case AArch64ISD::NOT: return "AArch64ISD::NOT";
case AArch64ISD::BIT: return "AArch64ISD::BIT";
case AArch64ISD::CBZ: return "AArch64ISD::CBZ";
case AArch64ISD::CBNZ: return "AArch64ISD::CBNZ";
case AArch64ISD::TBZ: return "AArch64ISD::TBZ";
case AArch64ISD::TBNZ: return "AArch64ISD::TBNZ";
case AArch64ISD::TC_RETURN: return "AArch64ISD::TC_RETURN";
case AArch64ISD::PREFETCH: return "AArch64ISD::PREFETCH";
case AArch64ISD::SITOF: return "AArch64ISD::SITOF";
case AArch64ISD::UITOF: return "AArch64ISD::UITOF";
case AArch64ISD::NVCAST: return "AArch64ISD::NVCAST";
case AArch64ISD::SQSHL_I: return "AArch64ISD::SQSHL_I";
case AArch64ISD::UQSHL_I: return "AArch64ISD::UQSHL_I";
case AArch64ISD::SRSHR_I: return "AArch64ISD::SRSHR_I";
case AArch64ISD::URSHR_I: return "AArch64ISD::URSHR_I";
case AArch64ISD::SQSHLU_I: return "AArch64ISD::SQSHLU_I";
case AArch64ISD::WrapperLarge: return "AArch64ISD::WrapperLarge";
case AArch64ISD::LD2post: return "AArch64ISD::LD2post";
case AArch64ISD::LD3post: return "AArch64ISD::LD3post";
case AArch64ISD::LD4post: return "AArch64ISD::LD4post";
case AArch64ISD::ST2post: return "AArch64ISD::ST2post";
case AArch64ISD::ST3post: return "AArch64ISD::ST3post";
case AArch64ISD::ST4post: return "AArch64ISD::ST4post";
case AArch64ISD::LD1x2post: return "AArch64ISD::LD1x2post";
case AArch64ISD::LD1x3post: return "AArch64ISD::LD1x3post";
case AArch64ISD::LD1x4post: return "AArch64ISD::LD1x4post";
case AArch64ISD::ST1x2post: return "AArch64ISD::ST1x2post";
case AArch64ISD::ST1x3post: return "AArch64ISD::ST1x3post";
case AArch64ISD::ST1x4post: return "AArch64ISD::ST1x4post";
case AArch64ISD::LD1DUPpost: return "AArch64ISD::LD1DUPpost";
case AArch64ISD::LD2DUPpost: return "AArch64ISD::LD2DUPpost";
case AArch64ISD::LD3DUPpost: return "AArch64ISD::LD3DUPpost";
case AArch64ISD::LD4DUPpost: return "AArch64ISD::LD4DUPpost";
case AArch64ISD::LD1LANEpost: return "AArch64ISD::LD1LANEpost";
case AArch64ISD::LD2LANEpost: return "AArch64ISD::LD2LANEpost";
case AArch64ISD::LD3LANEpost: return "AArch64ISD::LD3LANEpost";
case AArch64ISD::LD4LANEpost: return "AArch64ISD::LD4LANEpost";
case AArch64ISD::ST2LANEpost: return "AArch64ISD::ST2LANEpost";
case AArch64ISD::ST3LANEpost: return "AArch64ISD::ST3LANEpost";
case AArch64ISD::ST4LANEpost: return "AArch64ISD::ST4LANEpost";
case AArch64ISD::SMULL: return "AArch64ISD::SMULL";
case AArch64ISD::UMULL: return "AArch64ISD::UMULL";
case AArch64ISD::FRECPE: return "AArch64ISD::FRECPE";
case AArch64ISD::FRECPS: return "AArch64ISD::FRECPS";
case AArch64ISD::FRSQRTE: return "AArch64ISD::FRSQRTE";
case AArch64ISD::FRSQRTS: return "AArch64ISD::FRSQRTS";
}
return nullptr;
}
MachineBasicBlock *
AArch64TargetLowering::EmitF128CSEL(MachineInstr &MI,
MachineBasicBlock *MBB) const {
// We materialise the F128CSEL pseudo-instruction as some control flow and a
// phi node:
// OrigBB:
// [... previous instrs leading to comparison ...]
// b.ne TrueBB
// b EndBB
// TrueBB:
// ; Fallthrough
// EndBB:
// Dest = PHI [IfTrue, TrueBB], [IfFalse, OrigBB]
MachineFunction *MF = MBB->getParent();
const TargetInstrInfo *TII = Subtarget->getInstrInfo();
const BasicBlock *LLVM_BB = MBB->getBasicBlock();
DebugLoc DL = MI.getDebugLoc();
MachineFunction::iterator It = ++MBB->getIterator();
unsigned DestReg = MI.getOperand(0).getReg();
unsigned IfTrueReg = MI.getOperand(1).getReg();
unsigned IfFalseReg = MI.getOperand(2).getReg();
unsigned CondCode = MI.getOperand(3).getImm();
bool NZCVKilled = MI.getOperand(4).isKill();
MachineBasicBlock *TrueBB = MF->CreateMachineBasicBlock(LLVM_BB);
MachineBasicBlock *EndBB = MF->CreateMachineBasicBlock(LLVM_BB);
MF->insert(It, TrueBB);
MF->insert(It, EndBB);
// Transfer rest of current basic-block to EndBB
EndBB->splice(EndBB->begin(), MBB, std::next(MachineBasicBlock::iterator(MI)),
MBB->end());
EndBB->transferSuccessorsAndUpdatePHIs(MBB);
BuildMI(MBB, DL, TII->get(AArch64::Bcc)).addImm(CondCode).addMBB(TrueBB);
BuildMI(MBB, DL, TII->get(AArch64::B)).addMBB(EndBB);
MBB->addSuccessor(TrueBB);
MBB->addSuccessor(EndBB);
// TrueBB falls through to the end.
TrueBB->addSuccessor(EndBB);
if (!NZCVKilled) {
TrueBB->addLiveIn(AArch64::NZCV);
EndBB->addLiveIn(AArch64::NZCV);
}
BuildMI(*EndBB, EndBB->begin(), DL, TII->get(AArch64::PHI), DestReg)
.addReg(IfTrueReg)
.addMBB(TrueBB)
.addReg(IfFalseReg)
.addMBB(MBB);
MI.eraseFromParent();
return EndBB;
}
MachineBasicBlock *AArch64TargetLowering::EmitInstrWithCustomInserter(
MachineInstr &MI, MachineBasicBlock *BB) const {
switch (MI.getOpcode()) {
default:
#ifndef NDEBUG
MI.dump();
#endif
llvm_unreachable("Unexpected instruction for custom inserter!");
case AArch64::F128CSEL:
return EmitF128CSEL(MI, BB);
case TargetOpcode::STACKMAP:
case TargetOpcode::PATCHPOINT:
return emitPatchPoint(MI, BB);
}
}
//===----------------------------------------------------------------------===//
// AArch64 Lowering private implementation.
//===----------------------------------------------------------------------===//
//===----------------------------------------------------------------------===//
// Lowering Code
//===----------------------------------------------------------------------===//
/// changeIntCCToAArch64CC - Convert a DAG integer condition code to an AArch64
/// CC
static AArch64CC::CondCode changeIntCCToAArch64CC(ISD::CondCode CC) {
switch (CC) {
default:
llvm_unreachable("Unknown condition code!");
case ISD::SETNE:
return AArch64CC::NE;
case ISD::SETEQ:
return AArch64CC::EQ;
case ISD::SETGT:
return AArch64CC::GT;
case ISD::SETGE:
return AArch64CC::GE;
case ISD::SETLT:
return AArch64CC::LT;
case ISD::SETLE:
return AArch64CC::LE;
case ISD::SETUGT:
return AArch64CC::HI;
case ISD::SETUGE:
return AArch64CC::HS;
case ISD::SETULT:
return AArch64CC::LO;
case ISD::SETULE:
return AArch64CC::LS;
}
}
/// changeFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64 CC.
static void changeFPCCToAArch64CC(ISD::CondCode CC,
AArch64CC::CondCode &CondCode,
AArch64CC::CondCode &CondCode2) {
CondCode2 = AArch64CC::AL;
switch (CC) {
default:
llvm_unreachable("Unknown FP condition!");
case ISD::SETEQ:
case ISD::SETOEQ:
CondCode = AArch64CC::EQ;
break;
case ISD::SETGT:
case ISD::SETOGT:
CondCode = AArch64CC::GT;
break;
case ISD::SETGE:
case ISD::SETOGE:
CondCode = AArch64CC::GE;
break;
case ISD::SETOLT:
CondCode = AArch64CC::MI;
break;
case ISD::SETOLE:
CondCode = AArch64CC::LS;
break;
case ISD::SETONE:
CondCode = AArch64CC::MI;
CondCode2 = AArch64CC::GT;
break;
case ISD::SETO:
CondCode = AArch64CC::VC;
break;
case ISD::SETUO:
CondCode = AArch64CC::VS;
break;
case ISD::SETUEQ:
CondCode = AArch64CC::EQ;
CondCode2 = AArch64CC::VS;
break;
case ISD::SETUGT:
CondCode = AArch64CC::HI;
break;
case ISD::SETUGE:
CondCode = AArch64CC::PL;
break;
case ISD::SETLT:
case ISD::SETULT:
CondCode = AArch64CC::LT;
break;
case ISD::SETLE:
case ISD::SETULE:
CondCode = AArch64CC::LE;
break;
case ISD::SETNE:
case ISD::SETUNE:
CondCode = AArch64CC::NE;
break;
}
}
/// Convert a DAG fp condition code to an AArch64 CC.
/// This differs from changeFPCCToAArch64CC in that it returns cond codes that
/// should be AND'ed instead of OR'ed.
static void changeFPCCToANDAArch64CC(ISD::CondCode CC,
AArch64CC::CondCode &CondCode,
AArch64CC::CondCode &CondCode2) {
CondCode2 = AArch64CC::AL;
switch (CC) {
default:
changeFPCCToAArch64CC(CC, CondCode, CondCode2);
assert(CondCode2 == AArch64CC::AL);
break;
case ISD::SETONE:
// (a one b)
// == ((a olt b) || (a ogt b))
// == ((a ord b) && (a une b))
CondCode = AArch64CC::VC;
CondCode2 = AArch64CC::NE;
break;
case ISD::SETUEQ:
// (a ueq b)
// == ((a uno b) || (a oeq b))
// == ((a ule b) && (a uge b))
CondCode = AArch64CC::PL;
CondCode2 = AArch64CC::LE;
break;
}
}
/// changeVectorFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64
/// CC usable with the vector instructions. Fewer operations are available
/// without a real NZCV register, so we have to use less efficient combinations
/// to get the same effect.
static void changeVectorFPCCToAArch64CC(ISD::CondCode CC,
AArch64CC::CondCode &CondCode,
AArch64CC::CondCode &CondCode2,
bool &Invert) {
Invert = false;
switch (CC) {
default:
// Mostly the scalar mappings work fine.
changeFPCCToAArch64CC(CC, CondCode, CondCode2);
break;
case ISD::SETUO:
Invert = true;
LLVM_FALLTHROUGH;
case ISD::SETO:
CondCode = AArch64CC::MI;
CondCode2 = AArch64CC::GE;
break;
case ISD::SETUEQ:
case ISD::SETULT:
case ISD::SETULE:
case ISD::SETUGT:
case ISD::SETUGE:
// All of the compare-mask comparisons are ordered, but we can switch
// between the two by a double inversion. E.g. ULE == !OGT.
Invert = true;
changeFPCCToAArch64CC(getSetCCInverse(CC, false), CondCode, CondCode2);
break;
}
}
static bool isLegalArithImmed(uint64_t C) {
// Matches AArch64DAGToDAGISel::SelectArithImmed().
return (C >> 12 == 0) || ((C & 0xFFFULL) == 0 && C >> 24 == 0);
}
static SDValue emitComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC,
const SDLoc &dl, SelectionDAG &DAG) {
EVT VT = LHS.getValueType();
if (VT.isFloatingPoint()) {
assert(VT != MVT::f128);
if (VT == MVT::f16) {
LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, LHS);
RHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, RHS);
VT = MVT::f32;
}
return DAG.getNode(AArch64ISD::FCMP, dl, VT, LHS, RHS);
}
// The CMP instruction is just an alias for SUBS, and representing it as
// SUBS means that it's possible to get CSE with subtract operations.
// A later phase can perform the optimization of setting the destination
// register to WZR/XZR if it ends up being unused.
unsigned Opcode = AArch64ISD::SUBS;
if (RHS.getOpcode() == ISD::SUB && isNullConstant(RHS.getOperand(0)) &&
(CC == ISD::SETEQ || CC == ISD::SETNE)) {
// We'd like to combine a (CMP op1, (sub 0, op2) into a CMN instruction on
// the grounds that "op1 - (-op2) == op1 + op2". However, the C and V flags
// can be set differently by this operation. It comes down to whether
// "SInt(~op2)+1 == SInt(~op2+1)" (and the same for UInt). If they are then
// everything is fine. If not then the optimization is wrong. Thus general
// comparisons are only valid if op2 != 0.
// So, finally, the only LLVM-native comparisons that don't mention C and V
// are SETEQ and SETNE. They're the only ones we can safely use CMN for in
// the absence of information about op2.
Opcode = AArch64ISD::ADDS;
RHS = RHS.getOperand(1);
} else if (LHS.getOpcode() == ISD::AND && isNullConstant(RHS) &&
!isUnsignedIntSetCC(CC)) {
// Similarly, (CMP (and X, Y), 0) can be implemented with a TST
// (a.k.a. ANDS) except that the flags are only guaranteed to work for one
// of the signed comparisons.
Opcode = AArch64ISD::ANDS;
RHS = LHS.getOperand(1);
LHS = LHS.getOperand(0);
}
return DAG.getNode(Opcode, dl, DAG.getVTList(VT, MVT_CC), LHS, RHS)
.getValue(1);
}
/// \defgroup AArch64CCMP CMP;CCMP matching
///
/// These functions deal with the formation of CMP;CCMP;... sequences.
/// The CCMP/CCMN/FCCMP/FCCMPE instructions allow the conditional execution of
/// a comparison. They set the NZCV flags to a predefined value if their
/// predicate is false. This allows to express arbitrary conjunctions, for
/// example "cmp 0 (and (setCA (cmp A)) (setCB (cmp B))))"
/// expressed as:
/// cmp A
/// ccmp B, inv(CB), CA
/// check for CB flags
///
/// In general we can create code for arbitrary "... (and (and A B) C)"
/// sequences. We can also implement some "or" expressions, because "(or A B)"
/// is equivalent to "not (and (not A) (not B))" and we can implement some
/// negation operations:
/// We can negate the results of a single comparison by inverting the flags
/// used when the predicate fails and inverting the flags tested in the next
/// instruction; We can also negate the results of the whole previous
/// conditional compare sequence by inverting the flags tested in the next
/// instruction. However there is no way to negate the result of a partial
/// sequence.
///
/// Therefore on encountering an "or" expression we can negate the subtree on
/// one side and have to be able to push the negate to the leafs of the subtree
/// on the other side (see also the comments in code). As complete example:
/// "or (or (setCA (cmp A)) (setCB (cmp B)))
/// (and (setCC (cmp C)) (setCD (cmp D)))"
/// is transformed to
/// "not (and (not (and (setCC (cmp C)) (setCC (cmp D))))
/// (and (not (setCA (cmp A)) (not (setCB (cmp B))))))"
/// and implemented as:
/// cmp C
/// ccmp D, inv(CD), CC
/// ccmp A, CA, inv(CD)
/// ccmp B, CB, inv(CA)
/// check for CB flags
/// A counterexample is "or (and A B) (and C D)" which cannot be implemented
/// by conditional compare sequences.
/// @{
/// Create a conditional comparison; Use CCMP, CCMN or FCCMP as appropriate.
static SDValue emitConditionalComparison(SDValue LHS, SDValue RHS,
ISD::CondCode CC, SDValue CCOp,
AArch64CC::CondCode Predicate,
AArch64CC::CondCode OutCC,
const SDLoc &DL, SelectionDAG &DAG) {
unsigned Opcode = 0;
if (LHS.getValueType().isFloatingPoint()) {
assert(LHS.getValueType() != MVT::f128);
if (LHS.getValueType() == MVT::f16) {
LHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, LHS);
RHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, RHS);
}
Opcode = AArch64ISD::FCCMP;
} else if (RHS.getOpcode() == ISD::SUB) {
SDValue SubOp0 = RHS.getOperand(0);
if (isNullConstant(SubOp0) && (CC == ISD::SETEQ || CC == ISD::SETNE)) {
// See emitComparison() on why we can only do this for SETEQ and SETNE.
Opcode = AArch64ISD::CCMN;
RHS = RHS.getOperand(1);
}
}
if (Opcode == 0)
Opcode = AArch64ISD::CCMP;
SDValue Condition = DAG.getConstant(Predicate, DL, MVT_CC);
AArch64CC::CondCode InvOutCC = AArch64CC::getInvertedCondCode(OutCC);
unsigned NZCV = AArch64CC::getNZCVToSatisfyCondCode(InvOutCC);
SDValue NZCVOp = DAG.getConstant(NZCV, DL, MVT::i32);
return DAG.getNode(Opcode, DL, MVT_CC, LHS, RHS, NZCVOp, Condition, CCOp);
}
/// Returns true if @p Val is a tree of AND/OR/SETCC operations.
/// CanPushNegate is set to true if we can push a negate operation through
/// the tree in a was that we are left with AND operations and negate operations
/// at the leafs only. i.e. "not (or (or x y) z)" can be changed to
/// "and (and (not x) (not y)) (not z)"; "not (or (and x y) z)" cannot be
/// brought into such a form.
static bool isConjunctionDisjunctionTree(const SDValue Val, bool &CanNegate,
unsigned Depth = 0) {
if (!Val.hasOneUse())
return false;
unsigned Opcode = Val->getOpcode();
if (Opcode == ISD::SETCC) {
if (Val->getOperand(0).getValueType() == MVT::f128)
return false;
CanNegate = true;
return true;
}
// Protect against exponential runtime and stack overflow.
if (Depth > 6)
return false;
if (Opcode == ISD::AND || Opcode == ISD::OR) {
SDValue O0 = Val->getOperand(0);
SDValue O1 = Val->getOperand(1);
bool CanNegateL;
if (!isConjunctionDisjunctionTree(O0, CanNegateL, Depth+1))
return false;
bool CanNegateR;
if (!isConjunctionDisjunctionTree(O1, CanNegateR, Depth+1))
return false;
if (Opcode == ISD::OR) {
// For an OR expression we need to be able to negate at least one side or
// we cannot do the transformation at all.
if (!CanNegateL && !CanNegateR)
return false;
// We can however change a (not (or x y)) to (and (not x) (not y)) if we
// can negate the x and y subtrees.
CanNegate = CanNegateL && CanNegateR;
} else {
// If the operands are OR expressions then we finally need to negate their
// outputs, we can only do that for the operand with emitted last by
// negating OutCC, not for both operands.
bool NeedsNegOutL = O0->getOpcode() == ISD::OR;
bool NeedsNegOutR = O1->getOpcode() == ISD::OR;
if (NeedsNegOutL && NeedsNegOutR)
return false;
// We cannot negate an AND operation (it would become an OR),
CanNegate = false;
}
return true;
}
return false;
}
/// Emit conjunction or disjunction tree with the CMP/FCMP followed by a chain
/// of CCMP/CFCMP ops. See @ref AArch64CCMP.
/// Tries to transform the given i1 producing node @p Val to a series compare
/// and conditional compare operations. @returns an NZCV flags producing node
/// and sets @p OutCC to the flags that should be tested or returns SDValue() if
/// transformation was not possible.
/// On recursive invocations @p PushNegate may be set to true to have negation
/// effects pushed to the tree leafs; @p Predicate is an NZCV flag predicate
/// for the comparisons in the current subtree; @p Depth limits the search
/// depth to avoid stack overflow.
static SDValue emitConjunctionDisjunctionTreeRec(SelectionDAG &DAG, SDValue Val,
AArch64CC::CondCode &OutCC, bool Negate, SDValue CCOp,
AArch64CC::CondCode Predicate) {
// We're at a tree leaf, produce a conditional comparison operation.
unsigned Opcode = Val->getOpcode();
if (Opcode == ISD::SETCC) {
SDValue LHS = Val->getOperand(0);
SDValue RHS = Val->getOperand(1);
ISD::CondCode CC = cast<CondCodeSDNode>(Val->getOperand(2))->get();
bool isInteger = LHS.getValueType().isInteger();
if (Negate)
CC = getSetCCInverse(CC, isInteger);
SDLoc DL(Val);
// Determine OutCC and handle FP special case.
if (isInteger) {
OutCC = changeIntCCToAArch64CC(CC);
} else {
assert(LHS.getValueType().isFloatingPoint());
AArch64CC::CondCode ExtraCC;
changeFPCCToANDAArch64CC(CC, OutCC, ExtraCC);
// Some floating point conditions can't be tested with a single condition
// code. Construct an additional comparison in this case.
if (ExtraCC != AArch64CC::AL) {
SDValue ExtraCmp;
if (!CCOp.getNode())
ExtraCmp = emitComparison(LHS, RHS, CC, DL, DAG);
else
ExtraCmp = emitConditionalComparison(LHS, RHS, CC, CCOp, Predicate,
ExtraCC, DL, DAG);
CCOp = ExtraCmp;
Predicate = ExtraCC;
}
}
// Produce a normal comparison if we are first in the chain
if (!CCOp)
return emitComparison(LHS, RHS, CC, DL, DAG);
// Otherwise produce a ccmp.
return emitConditionalComparison(LHS, RHS, CC, CCOp, Predicate, OutCC, DL,
DAG);
}
assert((Opcode == ISD::AND || (Opcode == ISD::OR && Val->hasOneUse())) &&
"Valid conjunction/disjunction tree");
// Check if both sides can be transformed.
SDValue LHS = Val->getOperand(0);
SDValue RHS = Val->getOperand(1);
// In case of an OR we need to negate our operands and the result.
// (A v B) <=> not(not(A) ^ not(B))
bool NegateOpsAndResult = Opcode == ISD::OR;
// We can negate the results of all previous operations by inverting the
// predicate flags giving us a free negation for one side. The other side
// must be negatable by itself.
if (NegateOpsAndResult) {
// See which side we can negate.
bool CanNegateL;
bool isValidL = isConjunctionDisjunctionTree(LHS, CanNegateL);
assert(isValidL && "Valid conjunction/disjunction tree");
(void)isValidL;
#ifndef NDEBUG
bool CanNegateR;
bool isValidR = isConjunctionDisjunctionTree(RHS, CanNegateR);
assert(isValidR && "Valid conjunction/disjunction tree");
assert((CanNegateL || CanNegateR) && "Valid conjunction/disjunction tree");
#endif
// Order the side which we cannot negate to RHS so we can emit it first.
if (!CanNegateL)
std::swap(LHS, RHS);
} else {
bool NeedsNegOutL = LHS->getOpcode() == ISD::OR;
assert((!NeedsNegOutL || RHS->getOpcode() != ISD::OR) &&
"Valid conjunction/disjunction tree");
// Order the side where we need to negate the output flags to RHS so it
// gets emitted first.
if (NeedsNegOutL)
std::swap(LHS, RHS);
}
// Emit RHS. If we want to negate the tree we only need to push a negate
// through if we are already in a PushNegate case, otherwise we can negate
// the "flags to test" afterwards.
AArch64CC::CondCode RHSCC;
SDValue CmpR = emitConjunctionDisjunctionTreeRec(DAG, RHS, RHSCC, Negate,
CCOp, Predicate);
if (NegateOpsAndResult && !Negate)
RHSCC = AArch64CC::getInvertedCondCode(RHSCC);
// Emit LHS. We may need to negate it.
SDValue CmpL = emitConjunctionDisjunctionTreeRec(DAG, LHS, OutCC,
NegateOpsAndResult, CmpR,
RHSCC);
// If we transformed an OR to and AND then we have to negate the result
// (or absorb the Negate parameter).
if (NegateOpsAndResult && !Negate)
OutCC = AArch64CC::getInvertedCondCode(OutCC);
return CmpL;
}
/// Emit conjunction or disjunction tree with the CMP/FCMP followed by a chain
/// of CCMP/CFCMP ops. See @ref AArch64CCMP.
/// \see emitConjunctionDisjunctionTreeRec().
static SDValue emitConjunctionDisjunctionTree(SelectionDAG &DAG, SDValue Val,
AArch64CC::CondCode &OutCC) {
bool CanNegate;
if (!isConjunctionDisjunctionTree(Val, CanNegate))
return SDValue();
return emitConjunctionDisjunctionTreeRec(DAG, Val, OutCC, false, SDValue(),
AArch64CC::AL);
}
/// @}
static SDValue getAArch64Cmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
SDValue &AArch64cc, SelectionDAG &DAG,
const SDLoc &dl) {
if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS.getNode())) {
EVT VT = RHS.getValueType();
uint64_t C = RHSC->getZExtValue();
if (!isLegalArithImmed(C)) {
// Constant does not fit, try adjusting it by one?
switch (CC) {
default:
break;
case ISD::SETLT:
case ISD::SETGE:
if ((VT == MVT::i32 && C != 0x80000000 &&
isLegalArithImmed((uint32_t)(C - 1))) ||
(VT == MVT::i64 && C != 0x80000000ULL &&
isLegalArithImmed(C - 1ULL))) {
CC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGT;
C = (VT == MVT::i32) ? (uint32_t)(C - 1) : C - 1;
RHS = DAG.getConstant(C, dl, VT);
}
break;
case ISD::SETULT:
case ISD::SETUGE:
if ((VT == MVT::i32 && C != 0 &&
isLegalArithImmed((uint32_t)(C - 1))) ||
(VT == MVT::i64 && C != 0ULL && isLegalArithImmed(C - 1ULL))) {
CC = (CC == ISD::SETULT) ? ISD::SETULE : ISD::SETUGT;
C = (VT == MVT::i32) ? (uint32_t)(C - 1) : C - 1;
RHS = DAG.getConstant(C, dl, VT);
}
break;
case ISD::SETLE:
case ISD::SETGT:
if ((VT == MVT::i32 && C != INT32_MAX &&
isLegalArithImmed((uint32_t)(C + 1))) ||
(VT == MVT::i64 && C != INT64_MAX &&
isLegalArithImmed(C + 1ULL))) {
CC = (CC == ISD::SETLE) ? ISD::SETLT : ISD::SETGE;
C = (VT == MVT::i32) ? (uint32_t)(C + 1) : C + 1;
RHS = DAG.getConstant(C, dl, VT);
}
break;
case ISD::SETULE:
case ISD::SETUGT:
if ((VT == MVT::i32 && C != UINT32_MAX &&
isLegalArithImmed((uint32_t)(C + 1))) ||
(VT == MVT::i64 && C != UINT64_MAX &&
isLegalArithImmed(C + 1ULL))) {
CC = (CC == ISD::SETULE) ? ISD::SETULT : ISD::SETUGE;
C = (VT == MVT::i32) ? (uint32_t)(C + 1) : C + 1;
RHS = DAG.getConstant(C, dl, VT);
}
break;
}
}
}
SDValue Cmp;
AArch64CC::CondCode AArch64CC;
if ((CC == ISD::SETEQ || CC == ISD::SETNE) && isa<ConstantSDNode>(RHS)) {
const ConstantSDNode *RHSC = cast<ConstantSDNode>(RHS);
// The imm operand of ADDS is an unsigned immediate, in the range 0 to 4095.
// For the i8 operand, the largest immediate is 255, so this can be easily
// encoded in the compare instruction. For the i16 operand, however, the
// largest immediate cannot be encoded in the compare.
// Therefore, use a sign extending load and cmn to avoid materializing the
// -1 constant. For example,
// movz w1, #65535
// ldrh w0, [x0, #0]
// cmp w0, w1
// >
// ldrsh w0, [x0, #0]
// cmn w0, #1
// Fundamental, we're relying on the property that (zext LHS) == (zext RHS)
// if and only if (sext LHS) == (sext RHS). The checks are in place to
// ensure both the LHS and RHS are truly zero extended and to make sure the
// transformation is profitable.
if ((RHSC->getZExtValue() >> 16 == 0) && isa<LoadSDNode>(LHS) &&
cast<LoadSDNode>(LHS)->getExtensionType() == ISD::ZEXTLOAD &&
cast<LoadSDNode>(LHS)->getMemoryVT() == MVT::i16 &&
LHS.getNode()->hasNUsesOfValue(1, 0)) {
int16_t ValueofRHS = cast<ConstantSDNode>(RHS)->getZExtValue();
if (ValueofRHS < 0 && isLegalArithImmed(-ValueofRHS)) {
SDValue SExt =
DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, LHS.getValueType(), LHS,
DAG.getValueType(MVT::i16));
Cmp = emitComparison(SExt, DAG.getConstant(ValueofRHS, dl,
RHS.getValueType()),
CC, dl, DAG);
AArch64CC = changeIntCCToAArch64CC(CC);
}
}
if (!Cmp && (RHSC->isNullValue() || RHSC->isOne())) {
if ((Cmp = emitConjunctionDisjunctionTree(DAG, LHS, AArch64CC))) {
if ((CC == ISD::SETNE) ^ RHSC->isNullValue())
AArch64CC = AArch64CC::getInvertedCondCode(AArch64CC);
}
}
}
if (!Cmp) {
Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
AArch64CC = changeIntCCToAArch64CC(CC);
}
AArch64cc = DAG.getConstant(AArch64CC, dl, MVT_CC);
return Cmp;
}
static std::pair<SDValue, SDValue>
getAArch64XALUOOp(AArch64CC::CondCode &CC, SDValue Op, SelectionDAG &DAG) {
assert((Op.getValueType() == MVT::i32 || Op.getValueType() == MVT::i64) &&
"Unsupported value type");
SDValue Value, Overflow;
SDLoc DL(Op);
SDValue LHS = Op.getOperand(0);
SDValue RHS = Op.getOperand(1);
unsigned Opc = 0;
switch (Op.getOpcode()) {
default:
llvm_unreachable("Unknown overflow instruction!");
case ISD::SADDO:
Opc = AArch64ISD::ADDS;
CC = AArch64CC::VS;
break;
case ISD::UADDO:
Opc = AArch64ISD::ADDS;
CC = AArch64CC::HS;
break;
case ISD::SSUBO:
Opc = AArch64ISD::SUBS;
CC = AArch64CC::VS;
break;
case ISD::USUBO:
Opc = AArch64ISD::SUBS;
CC = AArch64CC::LO;
break;
// Multiply needs a little bit extra work.
case ISD::SMULO:
case ISD::UMULO: {
CC = AArch64CC::NE;
bool IsSigned = Op.getOpcode() == ISD::SMULO;
if (Op.getValueType() == MVT::i32) {
unsigned ExtendOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
// For a 32 bit multiply with overflow check we want the instruction
// selector to generate a widening multiply (SMADDL/UMADDL). For that we
// need to generate the following pattern:
// (i64 add 0, (i64 mul (i64 sext|zext i32 %a), (i64 sext|zext i32 %b))
LHS = DAG.getNode(ExtendOpc, DL, MVT::i64, LHS);
RHS = DAG.getNode(ExtendOpc, DL, MVT::i64, RHS);
SDValue Mul = DAG.getNode(ISD::MUL, DL, MVT::i64, LHS, RHS);
SDValue Add = DAG.getNode(ISD::ADD, DL, MVT::i64, Mul,
DAG.getConstant(0, DL, MVT::i64));
// On AArch64 the upper 32 bits are always zero extended for a 32 bit
// operation. We need to clear out the upper 32 bits, because we used a
// widening multiply that wrote all 64 bits. In the end this should be a
// noop.
Value = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Add);
if (IsSigned) {
// The signed overflow check requires more than just a simple check for
// any bit set in the upper 32 bits of the result. These bits could be
// just the sign bits of a negative number. To perform the overflow
// check we have to arithmetic shift right the 32nd bit of the result by
// 31 bits. Then we compare the result to the upper 32 bits.
SDValue UpperBits = DAG.getNode(ISD::SRL, DL, MVT::i64, Add,
DAG.getConstant(32, DL, MVT::i64));
UpperBits = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, UpperBits);
SDValue LowerBits = DAG.getNode(ISD::SRA, DL, MVT::i32, Value,
DAG.getConstant(31, DL, MVT::i64));
// It is important that LowerBits is last, otherwise the arithmetic
// shift will not be folded into the compare (SUBS).
SDVTList VTs = DAG.getVTList(MVT::i32, MVT::i32);
Overflow = DAG.getNode(AArch64ISD::SUBS, DL, VTs, UpperBits, LowerBits)
.getValue(1);
} else {
// The overflow check for unsigned multiply is easy. We only need to
// check if any of the upper 32 bits are set. This can be done with a
// CMP (shifted register). For that we need to generate the following
// pattern:
// (i64 AArch64ISD::SUBS i64 0, (i64 srl i64 %Mul, i64 32)
SDValue UpperBits = DAG.getNode(ISD::SRL, DL, MVT::i64, Mul,
DAG.getConstant(32, DL, MVT::i64));
SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32);
Overflow =
DAG.getNode(AArch64ISD::SUBS, DL, VTs,
DAG.getConstant(0, DL, MVT::i64),
UpperBits).getValue(1);
}
break;
}
assert(Op.getValueType() == MVT::i64 && "Expected an i64 value type");
// For the 64 bit multiply
Value = DAG.getNode(ISD::MUL, DL, MVT::i64, LHS, RHS);
if (IsSigned) {
SDValue UpperBits = DAG.getNode(ISD::MULHS, DL, MVT::i64, LHS, RHS);
SDValue LowerBits = DAG.getNode(ISD::SRA, DL, MVT::i64, Value,
DAG.getConstant(63, DL, MVT::i64));
// It is important that LowerBits is last, otherwise the arithmetic
// shift will not be folded into the compare (SUBS).
SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32);
Overflow = DAG.getNode(AArch64ISD::SUBS, DL, VTs, UpperBits, LowerBits)
.getValue(1);
} else {
SDValue UpperBits = DAG.getNode(ISD::MULHU, DL, MVT::i64, LHS, RHS);
SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32);
Overflow =
DAG.getNode(AArch64ISD::SUBS, DL, VTs,
DAG.getConstant(0, DL, MVT::i64),
UpperBits).getValue(1);
}
break;
}
} // switch (...)
if (Opc) {
SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::i32);
// Emit the AArch64 operation with overflow check.
Value = DAG.getNode(Opc, DL, VTs, LHS, RHS);
Overflow = Value.getValue(1);
}
return std::make_pair(Value, Overflow);
}
SDValue AArch64TargetLowering::LowerF128Call(SDValue Op, SelectionDAG &DAG,
RTLIB::Libcall Call) const {
SmallVector<SDValue, 2> Ops(Op->op_begin(), Op->op_end());
return makeLibCall(DAG, Call, MVT::f128, Ops, false, SDLoc(Op)).first;
}
static SDValue LowerXOR(SDValue Op, SelectionDAG &DAG) {
SDValue Sel = Op.getOperand(0);
SDValue Other = Op.getOperand(1);
// If neither operand is a SELECT_CC, give up.
if (Sel.getOpcode() != ISD::SELECT_CC)
std::swap(Sel, Other);
if (Sel.getOpcode() != ISD::SELECT_CC)
return Op;
// The folding we want to perform is:
// (xor x, (select_cc a, b, cc, 0, -1) )
// -->
// (csel x, (xor x, -1), cc ...)
//
// The latter will get matched to a CSINV instruction.
ISD::CondCode CC = cast<CondCodeSDNode>(Sel.getOperand(4))->get();
SDValue LHS = Sel.getOperand(0);
SDValue RHS = Sel.getOperand(1);
SDValue TVal = Sel.getOperand(2);
SDValue FVal = Sel.getOperand(3);
SDLoc dl(Sel);
// FIXME: This could be generalized to non-integer comparisons.
if (LHS.getValueType() != MVT::i32 && LHS.getValueType() != MVT::i64)
return Op;
ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(FVal);
ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(TVal);
// The values aren't constants, this isn't the pattern we're looking for.
if (!CFVal || !CTVal)
return Op;
// We can commute the SELECT_CC by inverting the condition. This
// might be needed to make this fit into a CSINV pattern.
if (CTVal->isAllOnesValue() && CFVal->isNullValue()) {
std::swap(TVal, FVal);
std::swap(CTVal, CFVal);
CC = ISD::getSetCCInverse(CC, true);
}
// If the constants line up, perform the transform!
if (CTVal->isNullValue() && CFVal->isAllOnesValue()) {
SDValue CCVal;
SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl);
FVal = Other;
TVal = DAG.getNode(ISD::XOR, dl, Other.getValueType(), Other,
DAG.getConstant(-1ULL, dl, Other.getValueType()));
return DAG.getNode(AArch64ISD::CSEL, dl, Sel.getValueType(), FVal, TVal,
CCVal, Cmp);
}
return Op;
}
static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) {
EVT VT = Op.getValueType();
// Let legalize expand this if it isn't a legal type yet.
if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
return SDValue();
SDVTList VTs = DAG.getVTList(VT, MVT::i32);
unsigned Opc;
bool ExtraOp = false;
switch (Op.getOpcode()) {
default:
llvm_unreachable("Invalid code");
case ISD::ADDC:
Opc = AArch64ISD::ADDS;
break;
case ISD::SUBC:
Opc = AArch64ISD::SUBS;
break;
case ISD::ADDE:
Opc = AArch64ISD::ADCS;
ExtraOp = true;
break;
case ISD::SUBE:
Opc = AArch64ISD::SBCS;
ExtraOp = true;
break;
}
if (!ExtraOp)
return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0), Op.getOperand(1));
return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0), Op.getOperand(1),
Op.getOperand(2));
}
static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
// Let legalize expand this if it isn't a legal type yet.
if (!DAG.getTargetLoweringInfo().isTypeLegal(Op.getValueType()))
return SDValue();
SDLoc dl(Op);
AArch64CC::CondCode CC;
// The actual operation that sets the overflow or carry flag.
SDValue Value, Overflow;
std::tie(Value, Overflow) = getAArch64XALUOOp(CC, Op, DAG);
// We use 0 and 1 as false and true values.
SDValue TVal = DAG.getConstant(1, dl, MVT::i32);
SDValue FVal = DAG.getConstant(0, dl, MVT::i32);
// We use an inverted condition, because the conditional select is inverted
// too. This will allow it to be selected to a single instruction:
// CSINC Wd, WZR, WZR, invert(cond).
SDValue CCVal = DAG.getConstant(getInvertedCondCode(CC), dl, MVT::i32);
Overflow = DAG.getNode(AArch64ISD::CSEL, dl, MVT::i32, FVal, TVal,
CCVal, Overflow);
SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
return DAG.getNode(ISD::MERGE_VALUES, dl, VTs, Value, Overflow);
}
// Prefetch operands are:
// 1: Address to prefetch
// 2: bool isWrite
// 3: int locality (0 = no locality ... 3 = extreme locality)
// 4: bool isDataCache
static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG) {
SDLoc DL(Op);
unsigned IsWrite = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
unsigned Locality = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
unsigned IsData = cast<ConstantSDNode>(Op.getOperand(4))->getZExtValue();
bool IsStream = !Locality;
// When the locality number is set
if (Locality) {
// The front-end should have filtered out the out-of-range values
assert(Locality <= 3 && "Prefetch locality out-of-range");
// The locality degree is the opposite of the cache speed.
// Put the number the other way around.
// The encoding starts at 0 for level 1
Locality = 3 - Locality;
}
// built the mask value encoding the expected behavior.
unsigned PrfOp = (IsWrite << 4) | // Load/Store bit
(!IsData << 3) | // IsDataCache bit
(Locality << 1) | // Cache level bits
(unsigned)IsStream; // Stream bit
return DAG.getNode(AArch64ISD::PREFETCH, DL, MVT::Other, Op.getOperand(0),
DAG.getConstant(PrfOp, DL, MVT::i32), Op.getOperand(1));
}
SDValue AArch64TargetLowering::LowerFP_EXTEND(SDValue Op,
SelectionDAG &DAG) const {
assert(Op.getValueType() == MVT::f128 && "Unexpected lowering");
RTLIB::Libcall LC;
LC = RTLIB::getFPEXT(Op.getOperand(0).getValueType(), Op.getValueType());
return LowerF128Call(Op, DAG, LC);
}
SDValue AArch64TargetLowering::LowerFP_ROUND(SDValue Op,
SelectionDAG &DAG) const {
if (Op.getOperand(0).getValueType() != MVT::f128) {
// It's legal except when f128 is involved
return Op;
}
RTLIB::Libcall LC;
LC = RTLIB::getFPROUND(Op.getOperand(0).getValueType(), Op.getValueType());
// FP_ROUND node has a second operand indicating whether it is known to be
// precise. That doesn't take part in the LibCall so we can't directly use
// LowerF128Call.
SDValue SrcVal = Op.getOperand(0);
return makeLibCall(DAG, LC, Op.getValueType(), SrcVal, /*isSigned*/ false,
SDLoc(Op)).first;
}
static SDValue LowerVectorFP_TO_INT(SDValue Op, SelectionDAG &DAG) {
// Warning: We maintain cost tables in AArch64TargetTransformInfo.cpp.
// Any additional optimization in this function should be recorded
// in the cost tables.
EVT InVT = Op.getOperand(0).getValueType();
EVT VT = Op.getValueType();
unsigned NumElts = InVT.getVectorNumElements();
// f16 vectors are promoted to f32 before a conversion.
if (InVT.getVectorElementType() == MVT::f16) {
MVT NewVT = MVT::getVectorVT(MVT::f32, NumElts);
SDLoc dl(Op);
return DAG.getNode(
Op.getOpcode(), dl, Op.getValueType(),
DAG.getNode(ISD::FP_EXTEND, dl, NewVT, Op.getOperand(0)));
}
if (VT.getSizeInBits() < InVT.getSizeInBits()) {
SDLoc dl(Op);
SDValue Cv =
DAG.getNode(Op.getOpcode(), dl, InVT.changeVectorElementTypeToInteger(),
Op.getOperand(0));
return DAG.getNode(ISD::TRUNCATE, dl, VT, Cv);
}
if (VT.getSizeInBits() > InVT.getSizeInBits()) {
SDLoc dl(Op);
MVT ExtVT =
MVT::getVectorVT(MVT::getFloatingPointVT(VT.getScalarSizeInBits()),
VT.getVectorNumElements());
SDValue Ext = DAG.getNode(ISD::FP_EXTEND, dl, ExtVT, Op.getOperand(0));
return DAG.getNode(Op.getOpcode(), dl, VT, Ext);
}
// Type changing conversions are illegal.
return Op;
}
SDValue AArch64TargetLowering::LowerFP_TO_INT(SDValue Op,
SelectionDAG &DAG) const {
if (Op.getOperand(0).getValueType().isVector())
return LowerVectorFP_TO_INT(Op, DAG);
// f16 conversions are promoted to f32.
if (Op.getOperand(0).getValueType() == MVT::f16) {
SDLoc dl(Op);
return DAG.getNode(
Op.getOpcode(), dl, Op.getValueType(),
DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, Op.getOperand(0)));
}
if (Op.getOperand(0).getValueType() != MVT::f128) {
// It's legal except when f128 is involved
return Op;
}
RTLIB::Libcall LC;
if (Op.getOpcode() == ISD::FP_TO_SINT)
LC = RTLIB::getFPTOSINT(Op.getOperand(0).getValueType(), Op.getValueType());
else
LC = RTLIB::getFPTOUINT(Op.getOperand(0).getValueType(), Op.getValueType());
SmallVector<SDValue, 2> Ops(Op->op_begin(), Op->op_end());
return makeLibCall(DAG, LC, Op.getValueType(), Ops, false, SDLoc(Op)).first;
}
static SDValue LowerVectorINT_TO_FP(SDValue Op, SelectionDAG &DAG) {
// Warning: We maintain cost tables in AArch64TargetTransformInfo.cpp.
// Any additional optimization in this function should be recorded
// in the cost tables.
EVT VT = Op.getValueType();
SDLoc dl(Op);
SDValue In = Op.getOperand(0);
EVT InVT = In.getValueType();
if (VT.getSizeInBits() < InVT.getSizeInBits()) {
MVT CastVT =
MVT::getVectorVT(MVT::getFloatingPointVT(InVT.getScalarSizeInBits()),
InVT.getVectorNumElements());
In = DAG.getNode(Op.getOpcode(), dl, CastVT, In);
return DAG.getNode(ISD::FP_ROUND, dl, VT, In, DAG.getIntPtrConstant(0, dl));
}
if (VT.getSizeInBits() > InVT.getSizeInBits()) {
unsigned CastOpc =
Op.getOpcode() == ISD::SINT_TO_FP ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
EVT CastVT = VT.changeVectorElementTypeToInteger();
In = DAG.getNode(CastOpc, dl, CastVT, In);
return DAG.getNode(Op.getOpcode(), dl, VT, In);
}
return Op;
}
SDValue AArch64TargetLowering::LowerINT_TO_FP(SDValue Op,
SelectionDAG &DAG) const {
if (Op.getValueType().isVector())
return LowerVectorINT_TO_FP(Op, DAG);
// f16 conversions are promoted to f32.
if (Op.getValueType() == MVT::f16) {
SDLoc dl(Op);
return DAG.getNode(
ISD::FP_ROUND, dl, MVT::f16,
DAG.getNode(Op.getOpcode(), dl, MVT::f32, Op.getOperand(0)),
DAG.getIntPtrConstant(0, dl));
}
// i128 conversions are libcalls.
if (Op.getOperand(0).getValueType() == MVT::i128)
return SDValue();
// Other conversions are legal, unless it's to the completely software-based
// fp128.
if (Op.getValueType() != MVT::f128)
return Op;
RTLIB::Libcall LC;
if (Op.getOpcode() == ISD::SINT_TO_FP)
LC = RTLIB::getSINTTOFP(Op.getOperand(0).getValueType(), Op.getValueType());
else
LC = RTLIB::getUINTTOFP(Op.getOperand(0).getValueType(), Op.getValueType());
return LowerF128Call(Op, DAG, LC);
}
SDValue AArch64TargetLowering::LowerFSINCOS(SDValue Op,
SelectionDAG &DAG) const {
// For iOS, we want to call an alternative entry point: __sincos_stret,
// which returns the values in two S / D registers.
SDLoc dl(Op);
SDValue Arg = Op.getOperand(0);
EVT ArgVT = Arg.getValueType();
Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
ArgListTy Args;
ArgListEntry Entry;
Entry.Node = Arg;
Entry.Ty = ArgTy;
Entry.isSExt = false;
Entry.isZExt = false;
Args.push_back(Entry);
const char *LibcallName =
(ArgVT == MVT::f64) ? "__sincos_stret" : "__sincosf_stret";
SDValue Callee =
DAG.getExternalSymbol(LibcallName, getPointerTy(DAG.getDataLayout()));
StructType *RetTy = StructType::get(ArgTy, ArgTy, nullptr);
TargetLowering::CallLoweringInfo CLI(DAG);
CLI.setDebugLoc(dl).setChain(DAG.getEntryNode())
.setCallee(CallingConv::Fast, RetTy, Callee, std::move(Args));
std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
return CallResult.first;
}
static SDValue LowerBITCAST(SDValue Op, SelectionDAG &DAG) {
if (Op.getValueType() != MVT::f16)
return SDValue();
assert(Op.getOperand(0).getValueType() == MVT::i16);
SDLoc DL(Op);
Op = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op.getOperand(0));
Op = DAG.getNode(ISD::BITCAST, DL, MVT::f32, Op);
return SDValue(
DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, DL, MVT::f16, Op,
DAG.getTargetConstant(AArch64::hsub, DL, MVT::i32)),
0);
}
static EVT getExtensionTo64Bits(const EVT &OrigVT) {
if (OrigVT.getSizeInBits() >= 64)
return OrigVT;
assert(OrigVT.isSimple() && "Expecting a simple value type");
MVT::SimpleValueType OrigSimpleTy = OrigVT.getSimpleVT().SimpleTy;
switch (OrigSimpleTy) {
default: llvm_unreachable("Unexpected Vector Type");
case MVT::v2i8:
case MVT::v2i16:
return MVT::v2i32;
case MVT::v4i8:
return MVT::v4i16;
}
}
static SDValue addRequiredExtensionForVectorMULL(SDValue N, SelectionDAG &DAG,
const EVT &OrigTy,
const EVT &ExtTy,
unsigned ExtOpcode) {
// The vector originally had a size of OrigTy. It was then extended to ExtTy.
// We expect the ExtTy to be 128-bits total. If the OrigTy is less than
// 64-bits we need to insert a new extension so that it will be 64-bits.
assert(ExtTy.is128BitVector() && "Unexpected extension size");
if (OrigTy.getSizeInBits() >= 64)
return N;
// Must extend size to at least 64 bits to be used as an operand for VMULL.
EVT NewVT = getExtensionTo64Bits(OrigTy);
return DAG.getNode(ExtOpcode, SDLoc(N), NewVT, N);
}
static bool isExtendedBUILD_VECTOR(SDNode *N, SelectionDAG &DAG,
bool isSigned) {
EVT VT = N->getValueType(0);
if (N->getOpcode() != ISD::BUILD_VECTOR)
return false;
for (const SDValue &Elt : N->op_values()) {
if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Elt)) {
unsigned EltSize = VT.getScalarSizeInBits();
unsigned HalfSize = EltSize / 2;
if (isSigned) {
if (!isIntN(HalfSize, C->getSExtValue()))
return false;
} else {
if (!isUIntN(HalfSize, C->getZExtValue()))
return false;
}
continue;
}
return false;
}
return true;
}
static SDValue skipExtensionForVectorMULL(SDNode *N, SelectionDAG &DAG) {
if (N->getOpcode() == ISD::SIGN_EXTEND || N->getOpcode() == ISD::ZERO_EXTEND)
return addRequiredExtensionForVectorMULL(N->getOperand(0), DAG,
N->getOperand(0)->getValueType(0),
N->getValueType(0),
N->getOpcode());
assert(N->getOpcode() == ISD::BUILD_VECTOR && "expected BUILD_VECTOR");
EVT VT = N->getValueType(0);
SDLoc dl(N);
unsigned EltSize = VT.getScalarSizeInBits() / 2;
unsigned NumElts = VT.getVectorNumElements();
MVT TruncVT = MVT::getIntegerVT(EltSize);
SmallVector<SDValue, 8> Ops;
for (unsigned i = 0; i != NumElts; ++i) {
ConstantSDNode *C = cast<ConstantSDNode>(N->getOperand(i));
const APInt &CInt = C->getAPIntValue();
// Element types smaller than 32 bits are not legal, so use i32 elements.
// The values are implicitly truncated so sext vs. zext doesn't matter.
Ops.push_back(DAG.getConstant(CInt.zextOrTrunc(32), dl, MVT::i32));
}
return DAG.getBuildVector(MVT::getVectorVT(TruncVT, NumElts), dl, Ops);
}
static bool isSignExtended(SDNode *N, SelectionDAG &DAG) {
if (N->getOpcode() == ISD::SIGN_EXTEND)
return true;
if (isExtendedBUILD_VECTOR(N, DAG, true))
return true;
return false;
}
static bool isZeroExtended(SDNode *N, SelectionDAG &DAG) {
if (N->getOpcode() == ISD::ZERO_EXTEND)
return true;
if (isExtendedBUILD_VECTOR(N, DAG, false))
return true;
return false;
}
static bool isAddSubSExt(SDNode *N, SelectionDAG &DAG) {
unsigned Opcode = N->getOpcode();
if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
SDNode *N0 = N->getOperand(0).getNode();
SDNode *N1 = N->getOperand(1).getNode();
return N0->hasOneUse() && N1->hasOneUse() &&
isSignExtended(N0, DAG) && isSignExtended(N1, DAG);
}
return false;
}
static bool isAddSubZExt(SDNode *N, SelectionDAG &DAG) {
unsigned Opcode = N->getOpcode();
if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
SDNode *N0 = N->getOperand(0).getNode();
SDNode *N1 = N->getOperand(1).getNode();
return N0->hasOneUse() && N1->hasOneUse() &&
isZeroExtended(N0, DAG) && isZeroExtended(N1, DAG);
}
return false;
}
static SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) {
// Multiplications are only custom-lowered for 128-bit vectors so that
// VMULL can be detected. Otherwise v2i64 multiplications are not legal.
EVT VT = Op.getValueType();
assert(VT.is128BitVector() && VT.isInteger() &&
"unexpected type for custom-lowering ISD::MUL");
SDNode *N0 = Op.getOperand(0).getNode();
SDNode *N1 = Op.getOperand(1).getNode();
unsigned NewOpc = 0;
bool isMLA = false;
bool isN0SExt = isSignExtended(N0, DAG);
bool isN1SExt = isSignExtended(N1, DAG);
if (isN0SExt && isN1SExt)
NewOpc = AArch64ISD::SMULL;
else {
bool isN0ZExt = isZeroExtended(N0, DAG);
bool isN1ZExt = isZeroExtended(N1, DAG);
if (isN0ZExt && isN1ZExt)
NewOpc = AArch64ISD::UMULL;
else if (isN1SExt || isN1ZExt) {
// Look for (s/zext A + s/zext B) * (s/zext C). We want to turn these
// into (s/zext A * s/zext C) + (s/zext B * s/zext C)
if (isN1SExt && isAddSubSExt(N0, DAG)) {
NewOpc = AArch64ISD::SMULL;
isMLA = true;
} else if (isN1ZExt && isAddSubZExt(N0, DAG)) {
NewOpc = AArch64ISD::UMULL;
isMLA = true;
} else if (isN0ZExt && isAddSubZExt(N1, DAG)) {
std::swap(N0, N1);
NewOpc = AArch64ISD::UMULL;
isMLA = true;
}
}
if (!NewOpc) {
if (VT == MVT::v2i64)
// Fall through to expand this. It is not legal.
return SDValue();
else
// Other vector multiplications are legal.
return Op;
}
}
// Legalize to a S/UMULL instruction
SDLoc DL(Op);
SDValue Op0;
SDValue Op1 = skipExtensionForVectorMULL(N1, DAG);
if (!isMLA) {
Op0 = skipExtensionForVectorMULL(N0, DAG);
assert(Op0.getValueType().is64BitVector() &&
Op1.getValueType().is64BitVector() &&
"unexpected types for extended operands to VMULL");
return DAG.getNode(NewOpc, DL, VT, Op0, Op1);
}
// Optimizing (zext A + zext B) * C, to (S/UMULL A, C) + (S/UMULL B, C) during
// isel lowering to take advantage of no-stall back to back s/umul + s/umla.
// This is true for CPUs with accumulate forwarding such as Cortex-A53/A57
SDValue N00 = skipExtensionForVectorMULL(N0->getOperand(0).getNode(), DAG);
SDValue N01 = skipExtensionForVectorMULL(N0->getOperand(1).getNode(), DAG);
EVT Op1VT = Op1.getValueType();
return DAG.getNode(N0->getOpcode(), DL, VT,
DAG.getNode(NewOpc, DL, VT,
DAG.getNode(ISD::BITCAST, DL, Op1VT, N00), Op1),
DAG.getNode(NewOpc, DL, VT,
DAG.getNode(ISD::BITCAST, DL, Op1VT, N01), Op1));
}
SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
SelectionDAG &DAG) const {
unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
SDLoc dl(Op);
switch (IntNo) {
default: return SDValue(); // Don't custom lower most intrinsics.
case Intrinsic::thread_pointer: {
EVT PtrVT = getPointerTy(DAG.getDataLayout());
return DAG.getNode(AArch64ISD::THREAD_POINTER, dl, PtrVT);
}
case Intrinsic::aarch64_neon_smax:
return DAG.getNode(ISD::SMAX, dl, Op.getValueType(),
Op.getOperand(1), Op.getOperand(2));
case Intrinsic::aarch64_neon_umax:
return DAG.getNode(ISD::UMAX, dl, Op.getValueType(),
Op.getOperand(1), Op.getOperand(2));
case Intrinsic::aarch64_neon_smin:
return DAG.getNode(ISD::SMIN, dl, Op.getValueType(),
Op.getOperand(1), Op.getOperand(2));
case Intrinsic::aarch64_neon_umin:
return DAG.getNode(ISD::UMIN, dl, Op.getValueType(),
Op.getOperand(1), Op.getOperand(2));
}
}
SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
SelectionDAG &DAG) const {
switch (Op.getOpcode()) {
default:
llvm_unreachable("unimplemented operand");
return SDValue();
case ISD::BITCAST:
return LowerBITCAST(Op, DAG);
case ISD::GlobalAddress:
return LowerGlobalAddress(Op, DAG);
case ISD::GlobalTLSAddress:
return LowerGlobalTLSAddress(Op, DAG);
case ISD::SETCC:
return LowerSETCC(Op, DAG);
case ISD::BR_CC:
return LowerBR_CC(Op, DAG);
case ISD::SELECT:
return LowerSELECT(Op, DAG);
case ISD::SELECT_CC:
return LowerSELECT_CC(Op, DAG);
case ISD::JumpTable:
return LowerJumpTable(Op, DAG);
case ISD::ConstantPool:
return LowerConstantPool(Op, DAG);
case ISD::BlockAddress:
return LowerBlockAddress(Op, DAG);
case ISD::VASTART:
return LowerVASTART(Op, DAG);
case ISD::VACOPY:
return LowerVACOPY(Op, DAG);
case ISD::VAARG:
return LowerVAARG(Op, DAG);
case ISD::ADDC:
case ISD::ADDE:
case ISD::SUBC:
case ISD::SUBE:
return LowerADDC_ADDE_SUBC_SUBE(Op, DAG);
case ISD::SADDO:
case ISD::UADDO:
case ISD::SSUBO:
case ISD::USUBO:
case ISD::SMULO:
case ISD::UMULO:
return LowerXALUO(Op, DAG);
case ISD::FADD:
return LowerF128Call(Op, DAG, RTLIB::ADD_F128);
case ISD::FSUB:
return LowerF128Call(Op, DAG, RTLIB::SUB_F128);
case ISD::FMUL:
return LowerF128Call(Op, DAG, RTLIB::MUL_F128);
case ISD::FDIV:
return LowerF128Call(Op, DAG, RTLIB::DIV_F128);
case ISD::FP_ROUND:
return LowerFP_ROUND(Op, DAG);
case ISD::FP_EXTEND:
return LowerFP_EXTEND(Op, DAG);
case ISD::FRAMEADDR:
return LowerFRAMEADDR(Op, DAG);
case ISD::RETURNADDR:
return LowerRETURNADDR(Op, DAG);
case ISD::INSERT_VECTOR_ELT:
return LowerINSERT_VECTOR_ELT(Op, DAG);
case ISD::EXTRACT_VECTOR_ELT:
return LowerEXTRACT_VECTOR_ELT(Op, DAG);
case ISD::BUILD_VECTOR:
return LowerBUILD_VECTOR(Op, DAG);
case ISD::VECTOR_SHUFFLE:
return LowerVECTOR_SHUFFLE(Op, DAG);
case ISD::EXTRACT_SUBVECTOR:
return LowerEXTRACT_SUBVECTOR(Op, DAG);
case ISD::SRA:
case ISD::SRL:
case ISD::SHL:
return LowerVectorSRA_SRL_SHL(Op, DAG);
case ISD::SHL_PARTS:
return LowerShiftLeftParts(Op, DAG);
case ISD::SRL_PARTS:
case ISD::SRA_PARTS:
return LowerShiftRightParts(Op, DAG);
case ISD::CTPOP:
return LowerCTPOP(Op, DAG);
case ISD::FCOPYSIGN:
return LowerFCOPYSIGN(Op, DAG);
case ISD::AND:
return LowerVectorAND(Op, DAG);
case ISD::OR:
return LowerVectorOR(Op, DAG);
case ISD::XOR:
return LowerXOR(Op, DAG);
case ISD::PREFETCH:
return LowerPREFETCH(Op, DAG);
case ISD::SINT_TO_FP:
case ISD::UINT_TO_FP:
return LowerINT_TO_FP(Op, DAG);
case ISD::FP_TO_SINT:
case ISD::FP_TO_UINT:
return LowerFP_TO_INT(Op, DAG);
case ISD::FSINCOS:
return LowerFSINCOS(Op, DAG);
case ISD::MUL:
return LowerMUL(Op, DAG);
case ISD::INTRINSIC_WO_CHAIN:
return LowerINTRINSIC_WO_CHAIN(Op, DAG);
}
}
//===----------------------------------------------------------------------===//
// Calling Convention Implementation
//===----------------------------------------------------------------------===//
#include "AArch64GenCallingConv.inc"
/// Selects the correct CCAssignFn for a given CallingConvention value.
CCAssignFn *AArch64TargetLowering::CCAssignFnForCall(CallingConv::ID CC,
bool IsVarArg) const {
switch (CC) {
default:
llvm_unreachable("Unsupported calling convention.");
case CallingConv::WebKit_JS:
return CC_AArch64_WebKit_JS;
case CallingConv::GHC:
return CC_AArch64_GHC;
case CallingConv::C:
case CallingConv::Fast:
case CallingConv::PreserveMost:
case CallingConv::CXX_FAST_TLS:
case CallingConv::Swift:
if (!Subtarget->isTargetDarwin())
return CC_AArch64_AAPCS;
return IsVarArg ? CC_AArch64_DarwinPCS_VarArg : CC_AArch64_DarwinPCS;
}
}
CCAssignFn *
AArch64TargetLowering::CCAssignFnForReturn(CallingConv::ID CC) const {
return CC == CallingConv::WebKit_JS ? RetCC_AArch64_WebKit_JS
: RetCC_AArch64_AAPCS;
}
SDValue AArch64TargetLowering::LowerFormalArguments(
SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
MachineFunction &MF = DAG.getMachineFunction();
MachineFrameInfo &MFI = MF.getFrameInfo();
// Assign locations to all of the incoming arguments.
SmallVector<CCValAssign, 16> ArgLocs;
CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
*DAG.getContext());
// At this point, Ins[].VT may already be promoted to i32. To correctly
// handle passing i8 as i8 instead of i32 on stack, we pass in both i32 and
// i8 to CC_AArch64_AAPCS with i32 being ValVT and i8 being LocVT.
// Since AnalyzeFormalArguments uses Ins[].VT for both ValVT and LocVT, here
// we use a special version of AnalyzeFormalArguments to pass in ValVT and
// LocVT.
unsigned NumArgs = Ins.size();
Function::const_arg_iterator CurOrigArg = MF.getFunction()->arg_begin();
unsigned CurArgIdx = 0;
for (unsigned i = 0; i != NumArgs; ++i) {
MVT ValVT = Ins[i].VT;
if (Ins[i].isOrigArg()) {
std::advance(CurOrigArg, Ins[i].getOrigArgIndex() - CurArgIdx);
CurArgIdx = Ins[i].getOrigArgIndex();
// Get type of the original argument.
EVT ActualVT = getValueType(DAG.getDataLayout(), CurOrigArg->getType(),
/*AllowUnknown*/ true);
MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : MVT::Other;
// If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16.
if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8)
ValVT = MVT::i8;
else if (ActualMVT == MVT::i16)
ValVT = MVT::i16;
}
CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, /*IsVarArg=*/false);
bool Res =
AssignFn(i, ValVT, ValVT, CCValAssign::Full, Ins[i].Flags, CCInfo);
assert(!Res && "Call operand has unhandled type");
(void)Res;
}
assert(ArgLocs.size() == Ins.size());
SmallVector<SDValue, 16> ArgValues;
for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
CCValAssign &VA = ArgLocs[i];
if (Ins[i].Flags.isByVal()) {
// Byval is used for HFAs in the PCS, but the system should work in a
// non-compliant manner for larger structs.
EVT PtrVT = getPointerTy(DAG.getDataLayout());
int Size = Ins[i].Flags.getByValSize();
unsigned NumRegs = (Size + 7) / 8;
// FIXME: This works on big-endian for composite byvals, which are the common
// case. It should also work for fundamental types too.
unsigned FrameIdx =
MFI.CreateFixedObject(8 * NumRegs, VA.getLocMemOffset(), false);
SDValue FrameIdxN = DAG.getFrameIndex(FrameIdx, PtrVT);
InVals.push_back(FrameIdxN);
continue;
}
if (VA.isRegLoc()) {
// Arguments stored in registers.
EVT RegVT = VA.getLocVT();
SDValue ArgValue;
const TargetRegisterClass *RC;
if (RegVT == MVT::i32)
RC = &AArch64::GPR32RegClass;
else if (RegVT == MVT::i64)
RC = &AArch64::GPR64RegClass;
else if (RegVT == MVT::f16)
RC = &AArch64::FPR16RegClass;
else if (RegVT == MVT::f32)
RC = &AArch64::FPR32RegClass;
else if (RegVT == MVT::f64 || RegVT.is64BitVector())
RC = &AArch64::FPR64RegClass;
else if (RegVT == MVT::f128 || RegVT.is128BitVector())
RC = &AArch64::FPR128RegClass;
else
llvm_unreachable("RegVT not supported by FORMAL_ARGUMENTS Lowering");
// Transform the arguments in physical registers into virtual ones.
unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
ArgValue = DAG.getCopyFromReg(Chain, DL, Reg, RegVT);
// If this is an 8, 16 or 32-bit value, it is really passed promoted
// to 64 bits. Insert an assert[sz]ext to capture this, then
// truncate to the right size.
switch (VA.getLocInfo()) {
default:
llvm_unreachable("Unknown loc info!");
case CCValAssign::Full:
break;
case CCValAssign::BCvt:
ArgValue = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), ArgValue);
break;
case CCValAssign::AExt:
case CCValAssign::SExt:
case CCValAssign::ZExt:
// SelectionDAGBuilder will insert appropriate AssertZExt & AssertSExt
// nodes after our lowering.
assert(RegVT == Ins[i].VT && "incorrect register location selected");
break;
}
InVals.push_back(ArgValue);
} else { // VA.isRegLoc()
assert(VA.isMemLoc() && "CCValAssign is neither reg nor mem");
unsigned ArgOffset = VA.getLocMemOffset();
unsigned ArgSize = VA.getValVT().getSizeInBits() / 8;
uint32_t BEAlign = 0;
if (!Subtarget->isLittleEndian() && ArgSize < 8 &&
!Ins[i].Flags.isInConsecutiveRegs())
BEAlign = 8 - ArgSize;
int FI = MFI.CreateFixedObject(ArgSize, ArgOffset + BEAlign, true);
// Create load nodes to retrieve arguments from the stack.
SDValue FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
SDValue ArgValue;
// For NON_EXTLOAD, generic code in getLoad assert(ValVT == MemVT)
ISD::LoadExtType ExtType = ISD::NON_EXTLOAD;
MVT MemVT = VA.getValVT();
switch (VA.getLocInfo()) {
default:
break;
case CCValAssign::BCvt:
MemVT = VA.getLocVT();
break;
case CCValAssign::SExt:
ExtType = ISD::SEXTLOAD;
break;
case CCValAssign::ZExt:
ExtType = ISD::ZEXTLOAD;
break;
case CCValAssign::AExt:
ExtType = ISD::EXTLOAD;
break;
}
ArgValue = DAG.getExtLoad(
ExtType, DL, VA.getLocVT(), Chain, FIN,
MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI),
MemVT);
InVals.push_back(ArgValue);
}
}
// varargs
AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
if (isVarArg) {
if (!Subtarget->isTargetDarwin()) {
// The AAPCS variadic function ABI is identical to the non-variadic
// one. As a result there may be more arguments in registers and we should
// save them for future reference.
saveVarArgRegisters(CCInfo, DAG, DL, Chain);
}
// This will point to the next argument passed via stack.
unsigned StackOffset = CCInfo.getNextStackOffset();
// We currently pass all varargs at 8-byte alignment.
StackOffset = ((StackOffset + 7) & ~7);
FuncInfo->setVarArgsStackIndex(MFI.CreateFixedObject(4, StackOffset, true));
}
unsigned StackArgSize = CCInfo.getNextStackOffset();
bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
if (DoesCalleeRestoreStack(CallConv, TailCallOpt)) {
// This is a non-standard ABI so by fiat I say we're allowed to make full
// use of the stack area to be popped, which must be aligned to 16 bytes in
// any case:
StackArgSize = alignTo(StackArgSize, 16);
// If we're expected to restore the stack (e.g. fastcc) then we'll be adding
// a multiple of 16.
FuncInfo->setArgumentStackToRestore(StackArgSize);
// This realignment carries over to the available bytes below. Our own
// callers will guarantee the space is free by giving an aligned value to
// CALLSEQ_START.
}
// Even if we're not expected to free up the space, it's useful to know how
// much is there while considering tail calls (because we can reuse it).
FuncInfo->setBytesInStackArgArea(StackArgSize);
return Chain;
}
void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo,
SelectionDAG &DAG,
const SDLoc &DL,
SDValue &Chain) const {
MachineFunction &MF = DAG.getMachineFunction();
MachineFrameInfo &MFI = MF.getFrameInfo();
AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
auto PtrVT = getPointerTy(DAG.getDataLayout());
SmallVector<SDValue, 8> MemOps;
static const MCPhysReg GPRArgRegs[] = { AArch64::X0, AArch64::X1, AArch64::X2,
AArch64::X3, AArch64::X4, AArch64::X5,
AArch64::X6, AArch64::X7 };
static const unsigned NumGPRArgRegs = array_lengthof(GPRArgRegs);
unsigned FirstVariadicGPR = CCInfo.getFirstUnallocated(GPRArgRegs);
unsigned GPRSaveSize = 8 * (NumGPRArgRegs - FirstVariadicGPR);
int GPRIdx = 0;
if (GPRSaveSize != 0) {
GPRIdx = MFI.CreateStackObject(GPRSaveSize, 8, false);
SDValue FIN = DAG.getFrameIndex(GPRIdx, PtrVT);
for (unsigned i = FirstVariadicGPR; i < NumGPRArgRegs; ++i) {
unsigned VReg = MF.addLiveIn(GPRArgRegs[i], &AArch64::GPR64RegClass);
SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64);
SDValue Store = DAG.getStore(
Val.getValue(1), DL, Val, FIN,
MachinePointerInfo::getStack(DAG.getMachineFunction(), i * 8));
MemOps.push_back(Store);
FIN =
DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getConstant(8, DL, PtrVT));
}
}
FuncInfo->setVarArgsGPRIndex(GPRIdx);
FuncInfo->setVarArgsGPRSize(GPRSaveSize);
if (Subtarget->hasFPARMv8()) {
static const MCPhysReg FPRArgRegs[] = {
AArch64::Q0, AArch64::Q1, AArch64::Q2, AArch64::Q3,
AArch64::Q4, AArch64::Q5, AArch64::Q6, AArch64::Q7};
static const unsigned NumFPRArgRegs = array_lengthof(FPRArgRegs);
unsigned FirstVariadicFPR = CCInfo.getFirstUnallocated(FPRArgRegs);
unsigned FPRSaveSize = 16 * (NumFPRArgRegs - FirstVariadicFPR);
int FPRIdx = 0;
if (FPRSaveSize != 0) {
FPRIdx = MFI.CreateStackObject(FPRSaveSize, 16, false);
SDValue FIN = DAG.getFrameIndex(FPRIdx, PtrVT);
for (unsigned i = FirstVariadicFPR; i < NumFPRArgRegs; ++i) {
unsigned VReg = MF.addLiveIn(FPRArgRegs[i], &AArch64::FPR128RegClass);
SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::f128);
SDValue Store = DAG.getStore(
Val.getValue(1), DL, Val, FIN,
MachinePointerInfo::getStack(DAG.getMachineFunction(), i * 16));
MemOps.push_back(Store);
FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN,
DAG.getConstant(16, DL, PtrVT));
}
}
FuncInfo->setVarArgsFPRIndex(FPRIdx);
FuncInfo->setVarArgsFPRSize(FPRSaveSize);
}
if (!MemOps.empty()) {
Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
}
}
/// LowerCallResult - Lower the result values of a call into the
/// appropriate copies out of appropriate physical registers.
SDValue AArch64TargetLowering::LowerCallResult(
SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg,
const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool isThisReturn,
SDValue ThisVal) const {
CCAssignFn *RetCC = CallConv == CallingConv::WebKit_JS
? RetCC_AArch64_WebKit_JS
: RetCC_AArch64_AAPCS;
// Assign locations to each value returned by this call.
SmallVector<CCValAssign, 16> RVLocs;
CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
*DAG.getContext());
CCInfo.AnalyzeCallResult(Ins, RetCC);
// Copy all of the result registers out of their specified physreg.
for (unsigned i = 0; i != RVLocs.size(); ++i) {
CCValAssign VA = RVLocs[i];
// Pass 'this' value directly from the argument to return value, to avoid
// reg unit interference
if (i == 0 && isThisReturn) {
assert(!VA.needsCustom() && VA.getLocVT() == MVT::i64 &&
"unexpected return calling convention register assignment");
InVals.push_back(ThisVal);
continue;
}
SDValue Val =
DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), InFlag);
Chain = Val.getValue(1);
InFlag = Val.getValue(2);
switch (VA.getLocInfo()) {
default:
llvm_unreachable("Unknown loc info!");
case CCValAssign::Full:
break;
case CCValAssign::BCvt:
Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val);
break;
}
InVals.push_back(Val);
}
return Chain;
}
/// Return true if the calling convention is one that we can guarantee TCO for.
static bool canGuaranteeTCO(CallingConv::ID CC) {
return CC == CallingConv::Fast;
}
/// Return true if we might ever do TCO for calls with this calling convention.
static bool mayTailCallThisCC(CallingConv::ID CC) {
switch (CC) {
case CallingConv::C:
case CallingConv::PreserveMost:
case CallingConv::Swift:
return true;
default:
return canGuaranteeTCO(CC);
}
}
bool AArch64TargetLowering::isEligibleForTailCallOptimization(
SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg,
const SmallVectorImpl<ISD::OutputArg> &Outs,
const SmallVectorImpl<SDValue> &OutVals,
const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
if (!mayTailCallThisCC(CalleeCC))
return false;
MachineFunction &MF = DAG.getMachineFunction();
const Function *CallerF = MF.getFunction();
CallingConv::ID CallerCC = CallerF->getCallingConv();
bool CCMatch = CallerCC == CalleeCC;
// Byval parameters hand the function a pointer directly into the stack area
// we want to reuse during a tail call. Working around this *is* possible (see
// X86) but less efficient and uglier in LowerCall.
for (Function::const_arg_iterator i = CallerF->arg_begin(),
e = CallerF->arg_end();
i != e; ++i)
if (i->hasByValAttr())
return false;
if (getTargetMachine().Options.GuaranteedTailCallOpt)
return canGuaranteeTCO(CalleeCC) && CCMatch;
// Externally-defined functions with weak linkage should not be
// tail-called on AArch64 when the OS does not support dynamic
// pre-emption of symbols, as the AAELF spec requires normal calls
// to undefined weak functions to be replaced with a NOP or jump to the
// next instruction. The behaviour of branch instructions in this
// situation (as used for tail calls) is implementation-defined, so we
// cannot rely on the linker replacing the tail call with a return.
if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
const GlobalValue *GV = G->getGlobal();
const Triple &TT = getTargetMachine().getTargetTriple();
if (GV->hasExternalWeakLinkage() &&
(!TT.isOSWindows() || TT.isOSBinFormatELF() || TT.isOSBinFormatMachO()))
return false;
}
// Now we search for cases where we can use a tail call without changing the
// ABI. Sibcall is used in some places (particularly gcc) to refer to this
// concept.
// I want anyone implementing a new calling convention to think long and hard
// about this assert.
assert((!isVarArg || CalleeCC == CallingConv::C) &&
"Unexpected variadic calling convention");
LLVMContext &C = *DAG.getContext();
if (isVarArg && !Outs.empty()) {
// At least two cases here: if caller is fastcc then we can't have any
// memory arguments (we'd be expected to clean up the stack afterwards). If
// caller is C then we could potentially use its argument area.
// FIXME: for now we take the most conservative of these in both cases:
// disallow all variadic memory operands.
SmallVector<CCValAssign, 16> ArgLocs;
CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, true));
for (const CCValAssign &ArgLoc : ArgLocs)
if (!ArgLoc.isRegLoc())
return false;
}
// Check that the call results are passed in the same way.
if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins,
CCAssignFnForCall(CalleeCC, isVarArg),
CCAssignFnForCall(CallerCC, isVarArg)))
return false;
// The callee has to preserve all registers the caller needs to preserve.
const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
if (!CCMatch) {
const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
return false;
}
// Nothing more to check if the callee is taking no arguments
if (Outs.empty())
return true;
SmallVector<CCValAssign, 16> ArgLocs;
CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, isVarArg));
const AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
// If the stack arguments for this call do not fit into our own save area then
// the call cannot be made tail.
if (CCInfo.getNextStackOffset() > FuncInfo->getBytesInStackArgArea())
return false;
const MachineRegisterInfo &MRI = MF.getRegInfo();
if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals))
return false;
return true;
}
SDValue AArch64TargetLowering::addTokenForArgument(SDValue Chain,
SelectionDAG &DAG,
MachineFrameInfo &MFI,
int ClobberedFI) const {
SmallVector<SDValue, 8> ArgChains;
int64_t FirstByte = MFI.getObjectOffset(ClobberedFI);
int64_t LastByte = FirstByte + MFI.getObjectSize(ClobberedFI) - 1;
// Include the original chain at the beginning of the list. When this is
// used by target LowerCall hooks, this helps legalize find the
// CALLSEQ_BEGIN node.
ArgChains.push_back(Chain);
// Add a chain value for each stack argument corresponding
for (SDNode::use_iterator U = DAG.getEntryNode().getNode()->use_begin(),
UE = DAG.getEntryNode().getNode()->use_end();
U != UE; ++U)
if (LoadSDNode *L = dyn_cast<LoadSDNode>(*U))
if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(L->getBasePtr()))
if (FI->getIndex() < 0) {
int64_t InFirstByte = MFI.getObjectOffset(FI->getIndex());
int64_t InLastByte = InFirstByte;
InLastByte += MFI.getObjectSize(FI->getIndex()) - 1;
if ((InFirstByte <= FirstByte && FirstByte <= InLastByte) ||
(FirstByte <= InFirstByte && InFirstByte <= LastByte))
ArgChains.push_back(SDValue(L, 1));
}
// Build a tokenfactor for all the chains.
return DAG.getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ArgChains);
}
bool AArch64TargetLowering::DoesCalleeRestoreStack(CallingConv::ID CallCC,
bool TailCallOpt) const {
return CallCC == CallingConv::Fast && TailCallOpt;
}
/// LowerCall - Lower a call to a callseq_start + CALL + callseq_end chain,
/// and add input and output parameter nodes.
SDValue
AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
SmallVectorImpl<SDValue> &InVals) const {
SelectionDAG &DAG = CLI.DAG;
SDLoc &DL = CLI.DL;
SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins;
SDValue Chain = CLI.Chain;
SDValue Callee = CLI.Callee;
bool &IsTailCall = CLI.IsTailCall;
CallingConv::ID CallConv = CLI.CallConv;
bool IsVarArg = CLI.IsVarArg;
MachineFunction &MF = DAG.getMachineFunction();
bool IsThisReturn = false;
AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
bool IsSibCall = false;
if (IsTailCall) {
// Check if it's really possible to do a tail call.
IsTailCall = isEligibleForTailCallOptimization(
Callee, CallConv, IsVarArg, Outs, OutVals, Ins, DAG);
if (!IsTailCall && CLI.CS && CLI.CS->isMustTailCall())
report_fatal_error("failed to perform tail call elimination on a call "
"site marked musttail");
// A sibling call is one where we're under the usual C ABI and not planning
// to change that but can still do a tail call:
if (!TailCallOpt && IsTailCall)
IsSibCall = true;
if (IsTailCall)
++NumTailCalls;
}
// Analyze operands of the call, assigning locations to each operand.
SmallVector<CCValAssign, 16> ArgLocs;
CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), ArgLocs,
*DAG.getContext());
if (IsVarArg) {
// Handle fixed and variable vector arguments differently.
// Variable vector arguments always go into memory.
unsigned NumArgs = Outs.size();
for (unsigned i = 0; i != NumArgs; ++i) {
MVT ArgVT = Outs[i].VT;
ISD::ArgFlagsTy ArgFlags = Outs[i].Flags;
CCAssignFn *AssignFn = CCAssignFnForCall(CallConv,
/*IsVarArg=*/ !Outs[i].IsFixed);
bool Res = AssignFn(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, CCInfo);
assert(!Res && "Call operand has unhandled type");
(void)Res;
}
} else {
// At this point, Outs[].VT may already be promoted to i32. To correctly
// handle passing i8 as i8 instead of i32 on stack, we pass in both i32 and
// i8 to CC_AArch64_AAPCS with i32 being ValVT and i8 being LocVT.
// Since AnalyzeCallOperands uses Ins[].VT for both ValVT and LocVT, here
// we use a special version of AnalyzeCallOperands to pass in ValVT and
// LocVT.
unsigned NumArgs = Outs.size();
for (unsigned i = 0; i != NumArgs; ++i) {
MVT ValVT = Outs[i].VT;
// Get type of the original argument.
EVT ActualVT = getValueType(DAG.getDataLayout(),
CLI.getArgs()[Outs[i].OrigArgIndex].Ty,
/*AllowUnknown*/ true);
MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : ValVT;
ISD::ArgFlagsTy ArgFlags = Outs[i].Flags;
// If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16.
if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8)
ValVT = MVT::i8;
else if (ActualMVT == MVT::i16)
ValVT = MVT::i16;
CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, /*IsVarArg=*/false);
bool Res = AssignFn(i, ValVT, ValVT, CCValAssign::Full, ArgFlags, CCInfo);
assert(!Res && "Call operand has unhandled type");
(void)Res;
}
}
// Get a count of how many bytes are to be pushed on the stack.
unsigned NumBytes = CCInfo.getNextStackOffset();
if (IsSibCall) {
// Since we're not changing the ABI to make this a tail call, the memory
// operands are already available in the caller's incoming argument space.
NumBytes = 0;
}
// FPDiff is the byte offset of the call's argument area from the callee's.
// Stores to callee stack arguments will be placed in FixedStackSlots offset
// by this amount for a tail call. In a sibling call it must be 0 because the
// caller will deallocate the entire stack and the callee still expects its
// arguments to begin at SP+0. Completely unused for non-tail calls.
int FPDiff = 0;
if (IsTailCall && !IsSibCall) {
unsigned NumReusableBytes = FuncInfo->getBytesInStackArgArea();
// Since callee will pop argument stack as a tail call, we must keep the
// popped size 16-byte aligned.
NumBytes = alignTo(NumBytes, 16);
// FPDiff will be negative if this tail call requires more space than we
// would automatically have in our incoming argument space. Positive if we
// can actually shrink the stack.
FPDiff = NumReusableBytes - NumBytes;
// The stack pointer must be 16-byte aligned at all times it's used for a
// memory operation, which in practice means at *all* times and in
// particular across call boundaries. Therefore our own arguments started at
// a 16-byte aligned SP and the delta applied for the tail call should
// satisfy the same constraint.
assert(FPDiff % 16 == 0 && "unaligned stack on tail call");
}
// Adjust the stack pointer for the new arguments...
// These operations are automatically eliminated by the prolog/epilog pass
if (!IsSibCall)
Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, DL,
true),
DL);
SDValue StackPtr = DAG.getCopyFromReg(Chain, DL, AArch64::SP,
getPointerTy(DAG.getDataLayout()));
SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
SmallVector<SDValue, 8> MemOpChains;
auto PtrVT = getPointerTy(DAG.getDataLayout());
// Walk the register/memloc assignments, inserting copies/loads.
for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size(); i != e;
++i, ++realArgIdx) {
CCValAssign &VA = ArgLocs[i];
SDValue Arg = OutVals[realArgIdx];
ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags;
// Promote the value if needed.
switch (VA.getLocInfo()) {
default:
llvm_unreachable("Unknown loc info!");
case CCValAssign::Full:
break;
case CCValAssign::SExt:
Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
break;
case CCValAssign::ZExt:
Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
break;
case CCValAssign::AExt:
if (Outs[realArgIdx].ArgVT == MVT::i1) {
// AAPCS requires i1 to be zero-extended to 8-bits by the caller.
Arg = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Arg);
Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i8, Arg);
}
Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
break;
case CCValAssign::BCvt:
Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
break;
case CCValAssign::FPExt:
Arg = DAG.getNode(ISD::FP_EXTEND, DL, VA.getLocVT(), Arg);
break;
}
if (VA.isRegLoc()) {
if (realArgIdx == 0 && Flags.isReturned() && Outs[0].VT == MVT::i64) {
assert(VA.getLocVT() == MVT::i64 &&
"unexpected calling convention register assignment");
assert(!Ins.empty() && Ins[0].VT == MVT::i64 &&
"unexpected use of 'returned'");
IsThisReturn = true;
}
RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
} else {
assert(VA.isMemLoc());
SDValue DstAddr;
MachinePointerInfo DstInfo;
// FIXME: This works on big-endian for composite byvals, which are the
// common case. It should also work for fundamental types too.
uint32_t BEAlign = 0;
unsigned OpSize = Flags.isByVal() ? Flags.getByValSize() * 8
: VA.getValVT().getSizeInBits();
OpSize = (OpSize + 7) / 8;
if (!Subtarget->isLittleEndian() && !Flags.isByVal() &&
!Flags.isInConsecutiveRegs()) {
if (OpSize < 8)
BEAlign = 8 - OpSize;
}
unsigned LocMemOffset = VA.getLocMemOffset();
int32_t Offset = LocMemOffset + BEAlign;
SDValue PtrOff = DAG.getIntPtrConstant(Offset, DL);
PtrOff = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, PtrOff);
if (IsTailCall) {
Offset = Offset + FPDiff;
int FI = MF.getFrameInfo().CreateFixedObject(OpSize, Offset, true);
DstAddr = DAG.getFrameIndex(FI, PtrVT);
DstInfo =
MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI);
// Make sure any stack arguments overlapping with where we're storing
// are loaded before this eventual operation. Otherwise they'll be
// clobbered.
Chain = addTokenForArgument(Chain, DAG, MF.getFrameInfo(), FI);
} else {
SDValue PtrOff = DAG.getIntPtrConstant(Offset, DL);
DstAddr = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, PtrOff);
DstInfo = MachinePointerInfo::getStack(DAG.getMachineFunction(),
LocMemOffset);
}
if (Outs[i].Flags.isByVal()) {
SDValue SizeNode =
DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i64);
SDValue Cpy = DAG.getMemcpy(
Chain, DL, DstAddr, Arg, SizeNode, Outs[i].Flags.getByValAlign(),
/*isVol = */ false, /*AlwaysInline = */ false,
/*isTailCall = */ false,
DstInfo, MachinePointerInfo());
MemOpChains.push_back(Cpy);
} else {
// Since we pass i1/i8/i16 as i1/i8/i16 on stack and Arg is already
// promoted to a legal register type i32, we should truncate Arg back to
// i1/i8/i16.
if (VA.getValVT() == MVT::i1 || VA.getValVT() == MVT::i8 ||
VA.getValVT() == MVT::i16)
Arg = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Arg);
SDValue Store = DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo);
MemOpChains.push_back(Store);
}
}
}
if (!MemOpChains.empty())
Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);
// Build a sequence of copy-to-reg nodes chained together with token chain
// and flag operands which copy the outgoing args into the appropriate regs.
SDValue InFlag;
for (auto &RegToPass : RegsToPass) {
Chain = DAG.getCopyToReg(Chain, DL, RegToPass.first,
RegToPass.second, InFlag);
InFlag = Chain.getValue(1);
}
// If the callee is a GlobalAddress/ExternalSymbol node (quite common, every
// direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol
// node so that legalize doesn't hack it.
if (getTargetMachine().getCodeModel() == CodeModel::Large &&
Subtarget->isTargetMachO()) {
if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
const GlobalValue *GV = G->getGlobal();
bool InternalLinkage = GV->hasInternalLinkage();
if (InternalLinkage)
Callee = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, 0);
else {
Callee =
DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_GOT);
Callee = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, Callee);
}
} else if (ExternalSymbolSDNode *S =
dyn_cast<ExternalSymbolSDNode>(Callee)) {
const char *Sym = S->getSymbol();
Callee = DAG.getTargetExternalSymbol(Sym, PtrVT, AArch64II::MO_GOT);
Callee = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, Callee);
}
} else if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
const GlobalValue *GV = G->getGlobal();
Callee = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, 0);
} else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
const char *Sym = S->getSymbol();
Callee = DAG.getTargetExternalSymbol(Sym, PtrVT, 0);
}
// We don't usually want to end the call-sequence here because we would tidy
// the frame up *after* the call, however in the ABI-changing tail-call case
// we've carefully laid out the parameters so that when sp is reset they'll be
// in the correct location.
if (IsTailCall && !IsSibCall) {
Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, DL, true),
DAG.getIntPtrConstant(0, DL, true), InFlag, DL);
InFlag = Chain.getValue(1);
}
std::vector<SDValue> Ops;
Ops.push_back(Chain);
Ops.push_back(Callee);
if (IsTailCall) {
// Each tail call may have to adjust the stack by a different amount, so
// this information must travel along with the operation for eventual
// consumption by emitEpilogue.
Ops.push_back(DAG.getTargetConstant(FPDiff, DL, MVT::i32));
}
// Add argument registers to the end of the list so that they are known live
// into the call.
for (auto &RegToPass : RegsToPass)
Ops.push_back(DAG.getRegister(RegToPass.first,
RegToPass.second.getValueType()));
// Add a register mask operand representing the call-preserved registers.
const uint32_t *Mask;
const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
if (IsThisReturn) {
// For 'this' returns, use the X0-preserving mask if applicable
Mask = TRI->getThisReturnPreservedMask(MF, CallConv);
if (!Mask) {
IsThisReturn = false;
Mask = TRI->getCallPreservedMask(MF, CallConv);
}
} else
Mask = TRI->getCallPreservedMask(MF, CallConv);
assert(Mask && "Missing call preserved mask for calling convention");
Ops.push_back(DAG.getRegisterMask(Mask));
if (InFlag.getNode())
Ops.push_back(InFlag);
SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
// If we're doing a tall call, use a TC_RETURN here rather than an
// actual call instruction.
if (IsTailCall) {
MF.getFrameInfo().setHasTailCall();
return DAG.getNode(AArch64ISD::TC_RETURN, DL, NodeTys, Ops);
}
// Returns a chain and a flag for retval copy to use.
Chain = DAG.getNode(AArch64ISD::CALL, DL, NodeTys, Ops);
InFlag = Chain.getValue(1);
uint64_t CalleePopBytes =
DoesCalleeRestoreStack(CallConv, TailCallOpt) ? alignTo(NumBytes, 16) : 0;
Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, DL, true),
DAG.getIntPtrConstant(CalleePopBytes, DL, true),
InFlag, DL);
if (!Ins.empty())
InFlag = Chain.getValue(1);
// Handle result values, copying them out of physregs into vregs that we
// return.
return LowerCallResult(Chain, InFlag, CallConv, IsVarArg, Ins, DL, DAG,
InVals, IsThisReturn,
IsThisReturn ? OutVals[0] : SDValue());
}
bool AArch64TargetLowering::CanLowerReturn(
CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg,
const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const {
CCAssignFn *RetCC = CallConv == CallingConv::WebKit_JS
? RetCC_AArch64_WebKit_JS
: RetCC_AArch64_AAPCS;
SmallVector<CCValAssign, 16> RVLocs;
CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
return CCInfo.CheckReturn(Outs, RetCC);
}
SDValue
AArch64TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
bool isVarArg,
const SmallVectorImpl<ISD::OutputArg> &Outs,
const SmallVectorImpl<SDValue> &OutVals,
const SDLoc &DL, SelectionDAG &DAG) const {
CCAssignFn *RetCC = CallConv == CallingConv::WebKit_JS
? RetCC_AArch64_WebKit_JS
: RetCC_AArch64_AAPCS;
SmallVector<CCValAssign, 16> RVLocs;
CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
*DAG.getContext());
CCInfo.AnalyzeReturn(Outs, RetCC);
// Copy the result values into the output registers.
SDValue Flag;
SmallVector<SDValue, 4> RetOps(1, Chain);
for (unsigned i = 0, realRVLocIdx = 0; i != RVLocs.size();
++i, ++realRVLocIdx) {
CCValAssign &VA = RVLocs[i];
assert(VA.isRegLoc() && "Can only return in registers!");
SDValue Arg = OutVals[realRVLocIdx];
switch (VA.getLocInfo()) {
default:
llvm_unreachable("Unknown loc info!");
case CCValAssign::Full:
if (Outs[i].ArgVT == MVT::i1) {
// AAPCS requires i1 to be zero-extended to i8 by the producer of the
// value. This is strictly redundant on Darwin (which uses "zeroext
// i1"), but will be optimised out before ISel.
Arg = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Arg);
Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
}
break;
case CCValAssign::BCvt:
Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
break;
}
Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Arg, Flag);
Flag = Chain.getValue(1);
RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
}
const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
const MCPhysReg *I =
TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
if (I) {
for (; *I; ++I) {
if (AArch64::GPR64RegClass.contains(*I))
RetOps.push_back(DAG.getRegister(*I, MVT::i64));
else if (AArch64::FPR64RegClass.contains(*I))
RetOps.push_back(DAG.getRegister(*I, MVT::getFloatingPointVT(64)));
else
llvm_unreachable("Unexpected register class in CSRsViaCopy!");
}
}
RetOps[0] = Chain; // Update chain.
// Add the flag if we have it.
if (Flag.getNode())
RetOps.push_back(Flag);
return DAG.getNode(AArch64ISD::RET_FLAG, DL, MVT::Other, RetOps);
}
//===----------------------------------------------------------------------===//
// Other Lowering Code
//===----------------------------------------------------------------------===//
SDValue AArch64TargetLowering::LowerGlobalAddress(SDValue Op,
SelectionDAG &DAG) const {
EVT PtrVT = getPointerTy(DAG.getDataLayout());
SDLoc DL(Op);
const GlobalAddressSDNode *GN = cast<GlobalAddressSDNode>(Op);
const GlobalValue *GV = GN->getGlobal();
unsigned char OpFlags =
Subtarget->ClassifyGlobalReference(GV, getTargetMachine());
assert(cast<GlobalAddressSDNode>(Op)->getOffset() == 0 &&
"unexpected offset in global node");
// This also catched the large code model case for Darwin.
if ((OpFlags & AArch64II::MO_GOT) != 0) {
SDValue GotAddr = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, OpFlags);
// FIXME: Once remat is capable of dealing with instructions with register
// operands, expand this into two nodes instead of using a wrapper node.
return DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, GotAddr);
}
if (getTargetMachine().getCodeModel() == CodeModel::Large) {
const unsigned char MO_NC = AArch64II::MO_NC;
return DAG.getNode(
AArch64ISD::WrapperLarge, DL, PtrVT,
DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_G3),
DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_G2 | MO_NC),
DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_G1 | MO_NC),
DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_G0 | MO_NC));
} else {
// Use ADRP/ADD or ADRP/LDR for everything else: the small model on ELF and
// the only correct model on Darwin.
SDValue Hi = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0,
OpFlags | AArch64II::MO_PAGE);
unsigned char LoFlags = OpFlags | AArch64II::MO_PAGEOFF | AArch64II::MO_NC;
SDValue Lo = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, LoFlags);
SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, Hi);
return DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, ADRP, Lo);
}
}
/// \brief Convert a TLS address reference into the correct sequence of loads
/// and calls to compute the variable's address (for Darwin, currently) and
/// return an SDValue containing the final node.
/// Darwin only has one TLS scheme which must be capable of dealing with the
/// fully general situation, in the worst case. This means:
/// + "extern __thread" declaration.
/// + Defined in a possibly unknown dynamic library.
///
/// The general system is that each __thread variable has a [3 x i64] descriptor
/// which contains information used by the runtime to calculate the address. The
/// only part of this the compiler needs to know about is the first xword, which
/// contains a function pointer that must be called with the address of the
/// entire descriptor in "x0".
///
/// Since this descriptor may be in a different unit, in general even the
/// descriptor must be accessed via an indirect load. The "ideal" code sequence
/// is:
/// adrp x0, _var@TLVPPAGE
/// ldr x0, [x0, _var@TLVPPAGEOFF] ; x0 now contains address of descriptor
/// ldr x1, [x0] ; x1 contains 1st entry of descriptor,
/// ; the function pointer
/// blr x1 ; Uses descriptor address in x0
/// ; Address of _var is now in x0.
///
/// If the address of _var's descriptor *is* known to the linker, then it can
/// change the first "ldr" instruction to an appropriate "add x0, x0, #imm" for
/// a slight efficiency gain.
SDValue
AArch64TargetLowering::LowerDarwinGlobalTLSAddress(SDValue Op,
SelectionDAG &DAG) const {
assert(Subtarget->isTargetDarwin() && "TLS only supported on Darwin");
SDLoc DL(Op);
MVT PtrVT = getPointerTy(DAG.getDataLayout());
const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
SDValue TLVPAddr =
DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS);
SDValue DescAddr = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, TLVPAddr);
// The first entry in the descriptor is a function pointer that we must call
// to obtain the address of the variable.
SDValue Chain = DAG.getEntryNode();
SDValue FuncTLVGet = DAG.getLoad(
MVT::i64, DL, Chain, DescAddr,
MachinePointerInfo::getGOT(DAG.getMachineFunction()),
/* Alignment = */ 8,
MachineMemOperand::MONonTemporal | MachineMemOperand::MOInvariant |
MachineMemOperand::MODereferenceable);
Chain = FuncTLVGet.getValue(1);
MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
MFI.setAdjustsStack(true);
// TLS calls preserve all registers except those that absolutely must be
// trashed: X0 (it takes an argument), LR (it's a call) and NZCV (let's not be
// silly).
const uint32_t *Mask =
Subtarget->getRegisterInfo()->getTLSCallPreservedMask();
// Finally, we can make the call. This is just a degenerate version of a
// normal AArch64 call node: x0 takes the address of the descriptor, and
// returns the address of the variable in this thread.
Chain = DAG.getCopyToReg(Chain, DL, AArch64::X0, DescAddr, SDValue());
Chain =
DAG.getNode(AArch64ISD::CALL, DL, DAG.getVTList(MVT::Other, MVT::Glue),
Chain, FuncTLVGet, DAG.getRegister(AArch64::X0, MVT::i64),
DAG.getRegisterMask(Mask), Chain.getValue(1));
return DAG.getCopyFromReg(Chain, DL, AArch64::X0, PtrVT, Chain.getValue(1));
}
/// When accessing thread-local variables under either the general-dynamic or
/// local-dynamic system, we make a "TLS-descriptor" call. The variable will
/// have a descriptor, accessible via a PC-relative ADRP, and whose first entry
/// is a function pointer to carry out the resolution.
///
/// The sequence is:
/// adrp x0, :tlsdesc:var
/// ldr x1, [x0, #:tlsdesc_lo12:var]
/// add x0, x0, #:tlsdesc_lo12:var
/// .tlsdesccall var
/// blr x1
/// (TPIDR_EL0 offset now in x0)
///
/// The above sequence must be produced unscheduled, to enable the linker to
/// optimize/relax this sequence.
/// Therefore, a pseudo-instruction (TLSDESC_CALLSEQ) is used to represent the
/// above sequence, and expanded really late in the compilation flow, to ensure
/// the sequence is produced as per above.
SDValue AArch64TargetLowering::LowerELFTLSDescCallSeq(SDValue SymAddr,
const SDLoc &DL,
SelectionDAG &DAG) const {
EVT PtrVT = getPointerTy(DAG.getDataLayout());
SDValue Chain = DAG.getEntryNode();
SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
Chain =
DAG.getNode(AArch64ISD::TLSDESC_CALLSEQ, DL, NodeTys, {Chain, SymAddr});
SDValue Glue = Chain.getValue(1);
return DAG.getCopyFromReg(Chain, DL, AArch64::X0, PtrVT, Glue);
}
SDValue
AArch64TargetLowering::LowerELFGlobalTLSAddress(SDValue Op,
SelectionDAG &DAG) const {
assert(Subtarget->isTargetELF() && "This function expects an ELF target");
assert(getTargetMachine().getCodeModel() == CodeModel::Small &&
"ELF TLS only supported in small memory model");
// Different choices can be made for the maximum size of the TLS area for a
// module. For the small address model, the default TLS size is 16MiB and the
// maximum TLS size is 4GiB.
// FIXME: add -mtls-size command line option and make it control the 16MiB
// vs. 4GiB code sequence generation.
const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
TLSModel::Model Model = getTargetMachine().getTLSModel(GA->getGlobal());
if (DAG.getTarget().Options.EmulatedTLS)
return LowerToTLSEmulatedModel(GA, DAG);
if (!EnableAArch64ELFLocalDynamicTLSGeneration) {
if (Model == TLSModel::LocalDynamic)
Model = TLSModel::GeneralDynamic;
}
SDValue TPOff;
EVT PtrVT = getPointerTy(DAG.getDataLayout());
SDLoc DL(Op);
const GlobalValue *GV = GA->getGlobal();
SDValue ThreadBase = DAG.getNode(AArch64ISD::THREAD_POINTER, DL, PtrVT);
if (Model == TLSModel::LocalExec) {
SDValue HiVar = DAG.getTargetGlobalAddress(
GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_HI12);
SDValue LoVar = DAG.getTargetGlobalAddress(
GV, DL, PtrVT, 0,
AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
SDValue TPWithOff_lo =
SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, ThreadBase,
HiVar,
DAG.getTargetConstant(0, DL, MVT::i32)),
0);
SDValue TPWithOff =
SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TPWithOff_lo,
LoVar,
DAG.getTargetConstant(0, DL, MVT::i32)),
0);
return TPWithOff;
} else if (Model == TLSModel::InitialExec) {
TPOff = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS);
TPOff = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, TPOff);
} else if (Model == TLSModel::LocalDynamic) {
// Local-dynamic accesses proceed in two phases. A general-dynamic TLS
// descriptor call against the special symbol _TLS_MODULE_BASE_ to calculate
// the beginning of the module's TLS region, followed by a DTPREL offset
// calculation.
// These accesses will need deduplicating if there's more than one.
AArch64FunctionInfo *MFI =
DAG.getMachineFunction().getInfo<AArch64FunctionInfo>();
MFI->incNumLocalDynamicTLSAccesses();
// The call needs a relocation too for linker relaxation. It doesn't make
// sense to call it MO_PAGE or MO_PAGEOFF though so we need another copy of
// the address.
SDValue SymAddr = DAG.getTargetExternalSymbol("_TLS_MODULE_BASE_", PtrVT,
AArch64II::MO_TLS);
// Now we can calculate the offset from TPIDR_EL0 to this module's
// thread-local area.
TPOff = LowerELFTLSDescCallSeq(SymAddr, DL, DAG);
// Now use :dtprel_whatever: operations to calculate this variable's offset
// in its thread-storage area.
SDValue HiVar = DAG.getTargetGlobalAddress(
GV, DL, MVT::i64, 0, AArch64II::MO_TLS | AArch64II::MO_HI12);
SDValue LoVar = DAG.getTargetGlobalAddress(
GV, DL, MVT::i64, 0,
AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
TPOff = SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TPOff, HiVar,
DAG.getTargetConstant(0, DL, MVT::i32)),
0);
TPOff = SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TPOff, LoVar,
DAG.getTargetConstant(0, DL, MVT::i32)),
0);
} else if (Model == TLSModel::GeneralDynamic) {
// The call needs a relocation too for linker relaxation. It doesn't make
// sense to call it MO_PAGE or MO_PAGEOFF though so we need another copy of
// the address.
SDValue SymAddr =
DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS);
// Finally we can make a call to calculate the offset from tpidr_el0.
TPOff = LowerELFTLSDescCallSeq(SymAddr, DL, DAG);
} else
llvm_unreachable("Unsupported ELF TLS access model");
return DAG.getNode(ISD::ADD, DL, PtrVT, ThreadBase, TPOff);
}
SDValue AArch64TargetLowering::LowerGlobalTLSAddress(SDValue Op,
SelectionDAG &DAG) const {
if (Subtarget->isTargetDarwin())
return LowerDarwinGlobalTLSAddress(Op, DAG);
else if (Subtarget->isTargetELF())
return LowerELFGlobalTLSAddress(Op, DAG);
llvm_unreachable("Unexpected platform trying to use TLS");
}
SDValue AArch64TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
SDValue Chain = Op.getOperand(0);
ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get();
SDValue LHS = Op.getOperand(2);
SDValue RHS = Op.getOperand(3);
SDValue Dest = Op.getOperand(4);
SDLoc dl(Op);
// Handle f128 first, since lowering it will result in comparing the return
// value of a libcall against zero, which is just what the rest of LowerBR_CC
// is expecting to deal with.
if (LHS.getValueType() == MVT::f128) {
softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl);
// If softenSetCCOperands returned a scalar, we need to compare the result
// against zero to select between true and false values.
if (!RHS.getNode()) {
RHS = DAG.getConstant(0, dl, LHS.getValueType());
CC = ISD::SETNE;
}
}
// Optimize {s|u}{add|sub|mul}.with.overflow feeding into a branch
// instruction.
unsigned Opc = LHS.getOpcode();
if (LHS.getResNo() == 1 && isOneConstant(RHS) &&
(Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO ||
Opc == ISD::USUBO || Opc == ISD::SMULO || Opc == ISD::UMULO)) {
assert((CC == ISD::SETEQ || CC == ISD::SETNE) &&
"Unexpected condition code.");
// Only lower legal XALUO ops.
if (!DAG.getTargetLoweringInfo().isTypeLegal(LHS->getValueType(0)))
return SDValue();
// The actual operation with overflow check.
AArch64CC::CondCode OFCC;
SDValue Value, Overflow;
std::tie(Value, Overflow) = getAArch64XALUOOp(OFCC, LHS.getValue(0), DAG);
if (CC == ISD::SETNE)
OFCC = getInvertedCondCode(OFCC);
SDValue CCVal = DAG.getConstant(OFCC, dl, MVT::i32);
return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
Overflow);
}
if (LHS.getValueType().isInteger()) {
assert((LHS.getValueType() == RHS.getValueType()) &&
(LHS.getValueType() == MVT::i32 || LHS.getValueType() == MVT::i64));
// If the RHS of the comparison is zero, we can potentially fold this
// to a specialized branch.
const ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS);
if (RHSC && RHSC->getZExtValue() == 0) {
if (CC == ISD::SETEQ) {
// See if we can use a TBZ to fold in an AND as well.
// TBZ has a smaller branch displacement than CBZ. If the offset is
// out of bounds, a late MI-layer pass rewrites branches.
// 403.gcc is an example that hits this case.
if (LHS.getOpcode() == ISD::AND &&
isa<ConstantSDNode>(LHS.getOperand(1)) &&
isPowerOf2_64(LHS.getConstantOperandVal(1))) {
SDValue Test = LHS.getOperand(0);
uint64_t Mask = LHS.getConstantOperandVal(1);
return DAG.getNode(AArch64ISD::TBZ, dl, MVT::Other, Chain, Test,
DAG.getConstant(Log2_64(Mask), dl, MVT::i64),
Dest);
}
return DAG.getNode(AArch64ISD::CBZ, dl, MVT::Other, Chain, LHS, Dest);
} else if (CC == ISD::SETNE) {
// See if we can use a TBZ to fold in an AND as well.
// TBZ has a smaller branch displacement than CBZ. If the offset is
// out of bounds, a late MI-layer pass rewrites branches.
// 403.gcc is an example that hits this case.
if (LHS.getOpcode() == ISD::AND &&
isa<ConstantSDNode>(LHS.getOperand(1)) &&
isPowerOf2_64(LHS.getConstantOperandVal(1))) {
SDValue Test = LHS.getOperand(0);
uint64_t Mask = LHS.getConstantOperandVal(1);
return DAG.getNode(AArch64ISD::TBNZ, dl, MVT::Other, Chain, Test,
DAG.getConstant(Log2_64(Mask), dl, MVT::i64),
Dest);
}
return DAG.getNode(AArch64ISD::CBNZ, dl, MVT::Other, Chain, LHS, Dest);
} else if (CC == ISD::SETLT && LHS.getOpcode() != ISD::AND) {
// Don't combine AND since emitComparison converts the AND to an ANDS
// (a.k.a. TST) and the test in the test bit and branch instruction
// becomes redundant. This would also increase register pressure.
uint64_t Mask = LHS.getValueSizeInBits() - 1;
return DAG.getNode(AArch64ISD::TBNZ, dl, MVT::Other, Chain, LHS,
DAG.getConstant(Mask, dl, MVT::i64), Dest);
}
}
if (RHSC && RHSC->getSExtValue() == -1 && CC == ISD::SETGT &&
LHS.getOpcode() != ISD::AND) {
// Don't combine AND since emitComparison converts the AND to an ANDS
// (a.k.a. TST) and the test in the test bit and branch instruction
// becomes redundant. This would also increase register pressure.
uint64_t Mask = LHS.getValueSizeInBits() - 1;
return DAG.getNode(AArch64ISD::TBZ, dl, MVT::Other, Chain, LHS,
DAG.getConstant(Mask, dl, MVT::i64), Dest);
}
SDValue CCVal;
SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl);
return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
Cmp);
}
assert(LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64);
// Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally
// clean. Some of them require two branches to implement.
SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
AArch64CC::CondCode CC1, CC2;
changeFPCCToAArch64CC(CC, CC1, CC2);
SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32);
SDValue BR1 =
DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CC1Val, Cmp);
if (CC2 != AArch64CC::AL) {
SDValue CC2Val = DAG.getConstant(CC2, dl, MVT::i32);
return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, BR1, Dest, CC2Val,
Cmp);
}
return BR1;
}
SDValue AArch64TargetLowering::LowerFCOPYSIGN(SDValue Op,
SelectionDAG &DAG) const {
EVT VT = Op.getValueType();
SDLoc DL(Op);
SDValue In1 = Op.getOperand(0);
SDValue In2 = Op.getOperand(1);
EVT SrcVT = In2.getValueType();
if (SrcVT.bitsLT(VT))
In2 = DAG.getNode(ISD::FP_EXTEND, DL, VT, In2);
else if (SrcVT.bitsGT(VT))
In2 = DAG.getNode(ISD::FP_ROUND, DL, VT, In2, DAG.getIntPtrConstant(0, DL));
EVT VecVT;
EVT EltVT;
uint64_t EltMask;
SDValue VecVal1, VecVal2;
if (VT == MVT::f32 || VT == MVT::v2f32 || VT == MVT::v4f32) {
EltVT = MVT::i32;
VecVT = (VT == MVT::v2f32 ? MVT::v2i32 : MVT::v4i32);
EltMask = 0x80000000ULL;
if (!VT.isVector()) {
VecVal1 = DAG.getTargetInsertSubreg(AArch64::ssub, DL, VecVT,
DAG.getUNDEF(VecVT), In1);
VecVal2 = DAG.getTargetInsertSubreg(AArch64::ssub, DL, VecVT,
DAG.getUNDEF(VecVT), In2);
} else {
VecVal1 = DAG.getNode(ISD::BITCAST, DL, VecVT, In1);
VecVal2 = DAG.getNode(ISD::BITCAST, DL, VecVT, In2);
}
} else if (VT == MVT::f64 || VT == MVT::v2f64) {
EltVT = MVT::i64;
VecVT = MVT::v2i64;
// We want to materialize a mask with the high bit set, but the AdvSIMD
// immediate moves cannot materialize that in a single instruction for
// 64-bit elements. Instead, materialize zero and then negate it.
EltMask = 0;
if (!VT.isVector()) {
VecVal1 = DAG.getTargetInsertSubreg(AArch64::dsub, DL, VecVT,
DAG.getUNDEF(VecVT), In1);
VecVal2 = DAG.getTargetInsertSubreg(AArch64::dsub, DL, VecVT,
DAG.getUNDEF(VecVT), In2);
} else {
VecVal1 = DAG.getNode(ISD::BITCAST, DL, VecVT, In1);
VecVal2 = DAG.getNode(ISD::BITCAST, DL, VecVT, In2);
}
} else {
llvm_unreachable("Invalid type for copysign!");
}
SDValue BuildVec = DAG.getConstant(EltMask, DL, VecVT);
// If we couldn't materialize the mask above, then the mask vector will be
// the zero vector, and we need to negate it here.
if (VT == MVT::f64 || VT == MVT::v2f64) {
BuildVec = DAG.getNode(ISD::BITCAST, DL, MVT::v2f64, BuildVec);
BuildVec = DAG.getNode(ISD::FNEG, DL, MVT::v2f64, BuildVec);
BuildVec = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, BuildVec);
}
SDValue Sel =
DAG.getNode(AArch64ISD::BIT, DL, VecVT, VecVal1, VecVal2, BuildVec);
if (VT == MVT::f32)
return DAG.getTargetExtractSubreg(AArch64::ssub, DL, VT, Sel);
else if (VT == MVT::f64)
return DAG.getTargetExtractSubreg(AArch64::dsub, DL, VT, Sel);
else
return DAG.getNode(ISD::BITCAST, DL, VT, Sel);
}
SDValue AArch64TargetLowering::LowerCTPOP(SDValue Op, SelectionDAG &DAG) const {
if (DAG.getMachineFunction().getFunction()->hasFnAttribute(
Attribute::NoImplicitFloat))
return SDValue();
if (!Subtarget->hasNEON())
return SDValue();
// While there is no integer popcount instruction, it can
// be more efficiently lowered to the following sequence that uses
// AdvSIMD registers/instructions as long as the copies to/from
// the AdvSIMD registers are cheap.
// FMOV D0, X0 // copy 64-bit int to vector, high bits zero'd
// CNT V0.8B, V0.8B // 8xbyte pop-counts
// ADDV B0, V0.8B // sum 8xbyte pop-counts
// UMOV X0, V0.B[0] // copy byte result back to integer reg
SDValue Val = Op.getOperand(0);
SDLoc DL(Op);
EVT VT = Op.getValueType();
if (VT == MVT::i32)
Val = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Val);
Val = DAG.getNode(ISD::BITCAST, DL, MVT::v8i8, Val);
SDValue CtPop = DAG.getNode(ISD::CTPOP, DL, MVT::v8i8, Val);
SDValue UaddLV = DAG.getNode(
ISD::INTRINSIC_WO_CHAIN, DL, MVT::i32,
DAG.getConstant(Intrinsic::aarch64_neon_uaddlv, DL, MVT::i32), CtPop);
if (VT == MVT::i64)
UaddLV = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, UaddLV);
return UaddLV;
}
SDValue AArch64TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
if (Op.getValueType().isVector())
return LowerVSETCC(Op, DAG);
SDValue LHS = Op.getOperand(0);
SDValue RHS = Op.getOperand(1);
ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
SDLoc dl(Op);
// We chose ZeroOrOneBooleanContents, so use zero and one.
EVT VT = Op.getValueType();
SDValue TVal = DAG.getConstant(1, dl, VT);
SDValue FVal = DAG.getConstant(0, dl, VT);
// Handle f128 first, since one possible outcome is a normal integer
// comparison which gets picked up by the next if statement.
if (LHS.getValueType() == MVT::f128) {
softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl);
// If softenSetCCOperands returned a scalar, use it.
if (!RHS.getNode()) {
assert(LHS.getValueType() == Op.getValueType() &&
"Unexpected setcc expansion!");
return LHS;
}
}
if (LHS.getValueType().isInteger()) {
SDValue CCVal;
SDValue Cmp =
getAArch64Cmp(LHS, RHS, ISD::getSetCCInverse(CC, true), CCVal, DAG, dl);
// Note that we inverted the condition above, so we reverse the order of
// the true and false operands here. This will allow the setcc to be
// matched to a single CSINC instruction.
return DAG.getNode(AArch64ISD::CSEL, dl, VT, FVal, TVal, CCVal, Cmp);
}
// Now we know we're dealing with FP values.
assert(LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64);
// If that fails, we'll need to perform an FCMP + CSEL sequence. Go ahead
// and do the comparison.
SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
AArch64CC::CondCode CC1, CC2;
changeFPCCToAArch64CC(CC, CC1, CC2);
if (CC2 == AArch64CC::AL) {
changeFPCCToAArch64CC(ISD::getSetCCInverse(CC, false), CC1, CC2);
SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32);
// Note that we inverted the condition above, so we reverse the order of
// the true and false operands here. This will allow the setcc to be
// matched to a single CSINC instruction.
return DAG.getNode(AArch64ISD::CSEL, dl, VT, FVal, TVal, CC1Val, Cmp);
} else {
// Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't
// totally clean. Some of them require two CSELs to implement. As is in
// this case, we emit the first CSEL and then emit a second using the output
// of the first as the RHS. We're effectively OR'ing the two CC's together.
// FIXME: It would be nice if we could match the two CSELs to two CSINCs.
SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32);
SDValue CS1 =
DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, FVal, CC1Val, Cmp);
SDValue CC2Val = DAG.getConstant(CC2, dl, MVT::i32);
return DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, CS1, CC2Val, Cmp);
}
}
SDValue AArch64TargetLowering::LowerSELECT_CC(ISD::CondCode CC, SDValue LHS,
SDValue RHS, SDValue TVal,
SDValue FVal, const SDLoc &dl,
SelectionDAG &DAG) const {
// Handle f128 first, because it will result in a comparison of some RTLIB
// call result against zero.
if (LHS.getValueType() == MVT::f128) {
softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl);
// If softenSetCCOperands returned a scalar, we need to compare the result
// against zero to select between true and false values.
if (!RHS.getNode()) {
RHS = DAG.getConstant(0, dl, LHS.getValueType());
CC = ISD::SETNE;
}
}
// Also handle f16, for which we need to do a f32 comparison.
if (LHS.getValueType() == MVT::f16) {
LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, LHS);
RHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, RHS);
}
// Next, handle integers.
if (LHS.getValueType().isInteger()) {
assert((LHS.getValueType() == RHS.getValueType()) &&
(LHS.getValueType() == MVT::i32 || LHS.getValueType() == MVT::i64));
unsigned Opcode = AArch64ISD::CSEL;
// If both the TVal and the FVal are constants, see if we can swap them in
// order to for a CSINV or CSINC out of them.
ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(FVal);
ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(TVal);
if (CTVal && CFVal && CTVal->isAllOnesValue() && CFVal->isNullValue()) {
std::swap(TVal, FVal);
std::swap(CTVal, CFVal);
CC = ISD::getSetCCInverse(CC, true);
} else if (CTVal && CFVal && CTVal->isOne() && CFVal->isNullValue()) {
std::swap(TVal, FVal);
std::swap(CTVal, CFVal);
CC = ISD::getSetCCInverse(CC, true);
} else if (TVal.getOpcode() == ISD::XOR) {
// If TVal is a NOT we want to swap TVal and FVal so that we can match
// with a CSINV rather than a CSEL.
if (isAllOnesConstant(TVal.getOperand(1))) {
std::swap(TVal, FVal);
std::swap(CTVal, CFVal);
CC = ISD::getSetCCInverse(CC, true);
}
} else if (TVal.getOpcode() == ISD::SUB) {
// If TVal is a negation (SUB from 0) we want to swap TVal and FVal so
// that we can match with a CSNEG rather than a CSEL.
if (isNullConstant(TVal.getOperand(0))) {
std::swap(TVal, FVal);
std::swap(CTVal, CFVal);
CC = ISD::getSetCCInverse(CC, true);
}
} else if (CTVal && CFVal) {
const int64_t TrueVal = CTVal->getSExtValue();
const int64_t FalseVal = CFVal->getSExtValue();
bool Swap = false;
// If both TVal and FVal are constants, see if FVal is the
// inverse/negation/increment of TVal and generate a CSINV/CSNEG/CSINC
// instead of a CSEL in that case.
if (TrueVal == ~FalseVal) {
Opcode = AArch64ISD::CSINV;
} else if (TrueVal == -FalseVal) {
Opcode = AArch64ISD::CSNEG;
} else if (TVal.getValueType() == MVT::i32) {
// If our operands are only 32-bit wide, make sure we use 32-bit
// arithmetic for the check whether we can use CSINC. This ensures that
// the addition in the check will wrap around properly in case there is
// an overflow (which would not be the case if we do the check with
// 64-bit arithmetic).
const uint32_t TrueVal32 = CTVal->getZExtValue();
const uint32_t FalseVal32 = CFVal->getZExtValue();
if ((TrueVal32 == FalseVal32 + 1) || (TrueVal32 + 1 == FalseVal32)) {
Opcode = AArch64ISD::CSINC;
if (TrueVal32 > FalseVal32) {
Swap = true;
}
}
// 64-bit check whether we can use CSINC.
} else if ((TrueVal == FalseVal + 1) || (TrueVal + 1 == FalseVal)) {
Opcode = AArch64ISD::CSINC;
if (TrueVal > FalseVal) {
Swap = true;
}
}
// Swap TVal and FVal if necessary.
if (Swap) {
std::swap(TVal, FVal);
std::swap(CTVal, CFVal);
CC = ISD::getSetCCInverse(CC, true);
}
if (Opcode != AArch64ISD::CSEL) {
// Drop FVal since we can get its value by simply inverting/negating
// TVal.
FVal = TVal;
}
}
// Avoid materializing a constant when possible by reusing a known value in
// a register. However, don't perform this optimization if the known value
// is one, zero or negative one in the case of a CSEL. We can always
// materialize these values using CSINC, CSEL and CSINV with wzr/xzr as the
// FVal, respectively.
ConstantSDNode *RHSVal = dyn_cast<ConstantSDNode>(RHS);
if (Opcode == AArch64ISD::CSEL && RHSVal && !RHSVal->isOne() &&
!RHSVal->isNullValue() && !RHSVal->isAllOnesValue()) {
AArch64CC::CondCode AArch64CC = changeIntCCToAArch64CC(CC);
// Transform "a == C ? C : x" to "a == C ? a : x" and "a != C ? x : C" to
// "a != C ? x : a" to avoid materializing C.
if (CTVal && CTVal == RHSVal && AArch64CC == AArch64CC::EQ)
TVal = LHS;
else if (CFVal && CFVal == RHSVal && AArch64CC == AArch64CC::NE)
FVal = LHS;
} else if (Opcode == AArch64ISD::CSNEG && RHSVal && RHSVal->isOne()) {
assert (CTVal && CFVal && "Expected constant operands for CSNEG.");
// Use a CSINV to transform "a == C ? 1 : -1" to "a == C ? a : -1" to
// avoid materializing C.
AArch64CC::CondCode AArch64CC = changeIntCCToAArch64CC(CC);
if (CTVal == RHSVal && AArch64CC == AArch64CC::EQ) {
Opcode = AArch64ISD::CSINV;
TVal = LHS;
FVal = DAG.getConstant(0, dl, FVal.getValueType());
}
}
SDValue CCVal;
SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl);
EVT VT = TVal.getValueType();
return DAG.getNode(Opcode, dl, VT, TVal, FVal, CCVal, Cmp);
}
// Now we know we're dealing with FP values.
assert(LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64);
assert(LHS.getValueType() == RHS.getValueType());
EVT VT = TVal.getValueType();
SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
// Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally
// clean. Some of them require two CSELs to implement.
AArch64CC::CondCode CC1, CC2;
changeFPCCToAArch64CC(CC, CC1, CC2);
if (DAG.getTarget().Options.UnsafeFPMath) {
// Transform "a == 0.0 ? 0.0 : x" to "a == 0.0 ? a : x" and
// "a != 0.0 ? x : 0.0" to "a != 0.0 ? x : a" to avoid materializing 0.0.
ConstantFPSDNode *RHSVal = dyn_cast<ConstantFPSDNode>(RHS);
if (RHSVal && RHSVal->isZero()) {
ConstantFPSDNode *CFVal = dyn_cast<ConstantFPSDNode>(FVal);
ConstantFPSDNode *CTVal = dyn_cast<ConstantFPSDNode>(TVal);
if ((CC == ISD::SETEQ || CC == ISD::SETOEQ || CC == ISD::SETUEQ) &&
CTVal && CTVal->isZero() && TVal.getValueType() == LHS.getValueType())
TVal = LHS;
else if ((CC == ISD::SETNE || CC == ISD::SETONE || CC == ISD::SETUNE) &&
CFVal && CFVal->isZero() &&
FVal.getValueType() == LHS.getValueType())
FVal = LHS;
}
}
// Emit first, and possibly only, CSEL.
SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32);
SDValue CS1 = DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, FVal, CC1Val, Cmp);
// If we need a second CSEL, emit it, using the output of the first as the
// RHS. We're effectively OR'ing the two CC's together.
if (CC2 != AArch64CC::AL) {
SDValue CC2Val = DAG.getConstant(CC2, dl, MVT::i32);
return DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, CS1, CC2Val, Cmp);
}
// Otherwise, return the output of the first CSEL.
return CS1;
}
SDValue AArch64TargetLowering::LowerSELECT_CC(SDValue Op,
SelectionDAG &DAG) const {
ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
SDValue LHS = Op.getOperand(0);
SDValue RHS = Op.getOperand(1);
SDValue TVal = Op.getOperand(2);
SDValue FVal = Op.getOperand(3);
SDLoc DL(Op);
return LowerSELECT_CC(CC, LHS, RHS, TVal, FVal, DL, DAG);
}
SDValue AArch64TargetLowering::LowerSELECT(SDValue Op,
SelectionDAG &DAG) const {
SDValue CCVal = Op->getOperand(0);
SDValue TVal = Op->getOperand(1);
SDValue FVal = Op->getOperand(2);
SDLoc DL(Op);
unsigned Opc = CCVal.getOpcode();
// Optimize {s|u}{add|sub|mul}.with.overflow feeding into a select
// instruction.
if (CCVal.getResNo() == 1 &&
(Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO ||
Opc == ISD::USUBO || Opc == ISD::SMULO || Opc == ISD::UMULO)) {
// Only lower legal XALUO ops.
if (!DAG.getTargetLoweringInfo().isTypeLegal(CCVal->getValueType(0)))
return SDValue();
AArch64CC::CondCode OFCC;
SDValue Value, Overflow;
std::tie(Value, Overflow) = getAArch64XALUOOp(OFCC, CCVal.getValue(0), DAG);
SDValue CCVal = DAG.getConstant(OFCC, DL, MVT::i32);
return DAG.getNode(AArch64ISD::CSEL, DL, Op.getValueType(), TVal, FVal,
CCVal, Overflow);
}
// Lower it the same way as we would lower a SELECT_CC node.
ISD::CondCode CC;
SDValue LHS, RHS;
if (CCVal.getOpcode() == ISD::SETCC) {
LHS = CCVal.getOperand(0);
RHS = CCVal.getOperand(1);
CC = cast<CondCodeSDNode>(CCVal->getOperand(2))->get();
} else {
LHS = CCVal;
RHS = DAG.getConstant(0, DL, CCVal.getValueType());
CC = ISD::SETNE;
}
return LowerSELECT_CC(CC, LHS, RHS, TVal, FVal, DL, DAG);
}
SDValue AArch64TargetLowering::LowerJumpTable(SDValue Op,
SelectionDAG &DAG) const {
// Jump table entries as PC relative offsets. No additional tweaking
// is necessary here. Just get the address of the jump table.
JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
EVT PtrVT = getPointerTy(DAG.getDataLayout());
SDLoc DL(Op);
if (getTargetMachine().getCodeModel() == CodeModel::Large &&
!Subtarget->isTargetMachO()) {
const unsigned char MO_NC = AArch64II::MO_NC;
return DAG.getNode(
AArch64ISD::WrapperLarge, DL, PtrVT,
DAG.getTargetJumpTable(JT->getIndex(), PtrVT, AArch64II::MO_G3),
DAG.getTargetJumpTable(JT->getIndex(), PtrVT, AArch64II::MO_G2 | MO_NC),
DAG.getTargetJumpTable(JT->getIndex(), PtrVT, AArch64II::MO_G1 | MO_NC),
DAG.getTargetJumpTable(JT->getIndex(), PtrVT,
AArch64II::MO_G0 | MO_NC));
}
SDValue Hi =
DAG.getTargetJumpTable(JT->getIndex(), PtrVT, AArch64II::MO_PAGE);
SDValue Lo = DAG.getTargetJumpTable(JT->getIndex(), PtrVT,
AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, Hi);
return DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, ADRP, Lo);
}
SDValue AArch64TargetLowering::LowerConstantPool(SDValue Op,
SelectionDAG &DAG) const {
ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
EVT PtrVT = getPointerTy(DAG.getDataLayout());
SDLoc DL(Op);
if (getTargetMachine().getCodeModel() == CodeModel::Large) {
// Use the GOT for the large code model on iOS.
if (Subtarget->isTargetMachO()) {
SDValue GotAddr = DAG.getTargetConstantPool(
CP->getConstVal(), PtrVT, CP->getAlignment(), CP->getOffset(),
AArch64II::MO_GOT);
return DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, GotAddr);
}
const unsigned char MO_NC = AArch64II::MO_NC;
return DAG.getNode(
AArch64ISD::WrapperLarge, DL, PtrVT,
DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, CP->getAlignment(),
CP->getOffset(), AArch64II::MO_G3),
DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, CP->getAlignment(),
CP->getOffset(), AArch64II::MO_G2 | MO_NC),
DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, CP->getAlignment(),
CP->getOffset(), AArch64II::MO_G1 | MO_NC),
DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, CP->getAlignment(),
CP->getOffset(), AArch64II::MO_G0 | MO_NC));
} else {
// Use ADRP/ADD or ADRP/LDR for everything else: the small memory model on
// ELF, the only valid one on Darwin.
SDValue Hi =
DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, CP->getAlignment(),
CP->getOffset(), AArch64II::MO_PAGE);
SDValue Lo = DAG.getTargetConstantPool(
CP->getConstVal(), PtrVT, CP->getAlignment(), CP->getOffset(),
AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, Hi);
return DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, ADRP, Lo);
}
}
SDValue AArch64TargetLowering::LowerBlockAddress(SDValue Op,
SelectionDAG &DAG) const {
const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
EVT PtrVT = getPointerTy(DAG.getDataLayout());
SDLoc DL(Op);
if (getTargetMachine().getCodeModel() == CodeModel::Large &&
!Subtarget->isTargetMachO()) {
const unsigned char MO_NC = AArch64II::MO_NC;
return DAG.getNode(
AArch64ISD::WrapperLarge, DL, PtrVT,
DAG.getTargetBlockAddress(BA, PtrVT, 0, AArch64II::MO_G3),
DAG.getTargetBlockAddress(BA, PtrVT, 0, AArch64II::MO_G2 | MO_NC),
DAG.getTargetBlockAddress(BA, PtrVT, 0, AArch64II::MO_G1 | MO_NC),
DAG.getTargetBlockAddress(BA, PtrVT, 0, AArch64II::MO_G0 | MO_NC));
} else {
SDValue Hi = DAG.getTargetBlockAddress(BA, PtrVT, 0, AArch64II::MO_PAGE);
SDValue Lo = DAG.getTargetBlockAddress(BA, PtrVT, 0, AArch64II::MO_PAGEOFF |
AArch64II::MO_NC);
SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, Hi);
return DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, ADRP, Lo);
}
}
SDValue AArch64TargetLowering::LowerDarwin_VASTART(SDValue Op,
SelectionDAG &DAG) const {
AArch64FunctionInfo *FuncInfo =
DAG.getMachineFunction().getInfo<AArch64FunctionInfo>();
SDLoc DL(Op);
SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsStackIndex(),
getPointerTy(DAG.getDataLayout()));
const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
MachinePointerInfo(SV));
}
SDValue AArch64TargetLowering::LowerAAPCS_VASTART(SDValue Op,
SelectionDAG &DAG) const {
// The layout of the va_list struct is specified in the AArch64 Procedure Call
// Standard, section B.3.
MachineFunction &MF = DAG.getMachineFunction();
AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
auto PtrVT = getPointerTy(DAG.getDataLayout());
SDLoc DL(Op);
SDValue Chain = Op.getOperand(0);
SDValue VAList = Op.getOperand(1);
const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
SmallVector<SDValue, 4> MemOps;
// void *__stack at offset 0
SDValue Stack = DAG.getFrameIndex(FuncInfo->getVarArgsStackIndex(), PtrVT);
MemOps.push_back(DAG.getStore(Chain, DL, Stack, VAList,
MachinePointerInfo(SV), /* Alignment = */ 8));
// void *__gr_top at offset 8
int GPRSize = FuncInfo->getVarArgsGPRSize();
if (GPRSize > 0) {
SDValue GRTop, GRTopAddr;
GRTopAddr =
DAG.getNode(ISD::ADD, DL, PtrVT, VAList, DAG.getConstant(8, DL, PtrVT));
GRTop = DAG.getFrameIndex(FuncInfo->getVarArgsGPRIndex(), PtrVT);
GRTop = DAG.getNode(ISD::ADD, DL, PtrVT, GRTop,
DAG.getConstant(GPRSize, DL, PtrVT));
MemOps.push_back(DAG.getStore(Chain, DL, GRTop, GRTopAddr,
MachinePointerInfo(SV, 8),
/* Alignment = */ 8));
}
// void *__vr_top at offset 16
int FPRSize = FuncInfo->getVarArgsFPRSize();
if (FPRSize > 0) {
SDValue VRTop, VRTopAddr;
VRTopAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
DAG.getConstant(16, DL, PtrVT));
VRTop = DAG.getFrameIndex(FuncInfo->getVarArgsFPRIndex(), PtrVT);
VRTop = DAG.getNode(ISD::ADD, DL, PtrVT, VRTop,
DAG.getConstant(FPRSize, DL, PtrVT));
MemOps.push_back(DAG.getStore(Chain, DL, VRTop, VRTopAddr,
MachinePointerInfo(SV, 16),
/* Alignment = */ 8));
}
// int __gr_offs at offset 24
SDValue GROffsAddr =
DAG.getNode(ISD::ADD, DL, PtrVT, VAList, DAG.getConstant(24, DL, PtrVT));
MemOps.push_back(DAG.getStore(
Chain, DL, DAG.getConstant(-GPRSize, DL, MVT::i32), GROffsAddr,
MachinePointerInfo(SV, 24), /* Alignment = */ 4));
// int __vr_offs at offset 28
SDValue VROffsAddr =
DAG.getNode(ISD::ADD, DL, PtrVT, VAList, DAG.getConstant(28, DL, PtrVT));
MemOps.push_back(DAG.getStore(
Chain, DL, DAG.getConstant(-FPRSize, DL, MVT::i32), VROffsAddr,
MachinePointerInfo(SV, 28), /* Alignment = */ 4));
return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
}
SDValue AArch64TargetLowering::LowerVASTART(SDValue Op,
SelectionDAG &DAG) const {
return Subtarget->isTargetDarwin() ? LowerDarwin_VASTART(Op, DAG)
: LowerAAPCS_VASTART(Op, DAG);
}
SDValue AArch64TargetLowering::LowerVACOPY(SDValue Op,
SelectionDAG &DAG) const {
// AAPCS has three pointers and two ints (= 32 bytes), Darwin has single
// pointer.
SDLoc DL(Op);
unsigned VaListSize = Subtarget->isTargetDarwin() ? 8 : 32;
const Value *DestSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
return DAG.getMemcpy(Op.getOperand(0), DL, Op.getOperand(1),
Op.getOperand(2),
DAG.getConstant(VaListSize, DL, MVT::i32),
8, false, false, false, MachinePointerInfo(DestSV),
MachinePointerInfo(SrcSV));
}
SDValue AArch64TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
assert(Subtarget->isTargetDarwin() &&
"automatic va_arg instruction only works on Darwin");
const Value *V = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
EVT VT = Op.getValueType();
SDLoc DL(Op);
SDValue Chain = Op.getOperand(0);
SDValue Addr = Op.getOperand(1);
unsigned Align = Op.getConstantOperandVal(3);
auto PtrVT = getPointerTy(DAG.getDataLayout());
SDValue VAList = DAG.getLoad(PtrVT, DL, Chain, Addr, MachinePointerInfo(V));
Chain = VAList.getValue(1);
if (Align > 8) {
assert(((Align & (Align - 1)) == 0) && "Expected Align to be a power of 2");
VAList = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
DAG.getConstant(Align - 1, DL, PtrVT));
VAList = DAG.getNode(ISD::AND, DL, PtrVT, VAList,
DAG.getConstant(-(int64_t)Align, DL, PtrVT));
}
Type *ArgTy = VT.getTypeForEVT(*DAG.getContext());
uint64_t ArgSize = DAG.getDataLayout().getTypeAllocSize(ArgTy);
// Scalar integer and FP values smaller than 64 bits are implicitly extended
// up to 64 bits. At the very least, we have to increase the striding of the
// vaargs list to match this, and for FP values we need to introduce
// FP_ROUND nodes as well.
if (VT.isInteger() && !VT.isVector())
ArgSize = 8;
bool NeedFPTrunc = false;
if (VT.isFloatingPoint() && !VT.isVector() && VT != MVT::f64) {
ArgSize = 8;
NeedFPTrunc = true;
}
// Increment the pointer, VAList, to the next vaarg
SDValue VANext = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
DAG.getConstant(ArgSize, DL, PtrVT));
// Store the incremented VAList to the legalized pointer
SDValue APStore =
DAG.getStore(Chain, DL, VANext, Addr, MachinePointerInfo(V));
// Load the actual argument out of the pointer VAList
if (NeedFPTrunc) {
// Load the value as an f64.
SDValue WideFP =
DAG.getLoad(MVT::f64, DL, APStore, VAList, MachinePointerInfo());
// Round the value down to an f32.
SDValue NarrowFP = DAG.getNode(ISD::FP_ROUND, DL, VT, WideFP.getValue(0),
DAG.getIntPtrConstant(1, DL));
SDValue Ops[] = { NarrowFP, WideFP.getValue(1) };
// Merge the rounded value with the chain output of the load.
return DAG.getMergeValues(Ops, DL);
}
return DAG.getLoad(VT, DL, APStore, VAList, MachinePointerInfo());
}
SDValue AArch64TargetLowering::LowerFRAMEADDR(SDValue Op,
SelectionDAG &DAG) const {
MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
MFI.setFrameAddressIsTaken(true);
EVT VT = Op.getValueType();
SDLoc DL(Op);
unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
SDValue FrameAddr =
DAG.getCopyFromReg(DAG.getEntryNode(), DL, AArch64::FP, VT);
while (Depth--)
FrameAddr = DAG.getLoad(VT, DL, DAG.getEntryNode(), FrameAddr,
MachinePointerInfo());
return FrameAddr;
}
// FIXME? Maybe this could be a TableGen attribute on some registers and
// this table could be generated automatically from RegInfo.
unsigned AArch64TargetLowering::getRegisterByName(const char* RegName, EVT VT,
SelectionDAG &DAG) const {
unsigned Reg = StringSwitch<unsigned>(RegName)
.Case("sp", AArch64::SP)
.Default(0);
if (Reg)
return Reg;
report_fatal_error(Twine("Invalid register name \""
+ StringRef(RegName) + "\"."));
}
SDValue AArch64TargetLowering::LowerRETURNADDR(SDValue Op,
SelectionDAG &DAG) const {
MachineFunction &MF = DAG.getMachineFunction();
MachineFrameInfo &MFI = MF.getFrameInfo();
MFI.setReturnAddressIsTaken(true);
EVT VT = Op.getValueType();
SDLoc DL(Op);
unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
if (Depth) {
SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
SDValue Offset = DAG.getConstant(8, DL, getPointerTy(DAG.getDataLayout()));
return DAG.getLoad(VT, DL, DAG.getEntryNode(),
DAG.getNode(ISD::ADD, DL, VT, FrameAddr, Offset),
MachinePointerInfo());
}
// Return LR, which contains the return address. Mark it an implicit live-in.
unsigned Reg = MF.addLiveIn(AArch64::LR, &AArch64::GPR64RegClass);
return DAG.getCopyFromReg(DAG.getEntryNode(), DL, Reg, VT);
}
/// LowerShiftRightParts - Lower SRA_PARTS, which returns two
/// i64 values and take a 2 x i64 value to shift plus a shift amount.
SDValue AArch64TargetLowering::LowerShiftRightParts(SDValue Op,
SelectionDAG &DAG) const {
assert(Op.getNumOperands() == 3 && "Not a double-shift!");
EVT VT = Op.getValueType();
unsigned VTBits = VT.getSizeInBits();
SDLoc dl(Op);
SDValue ShOpLo = Op.getOperand(0);
SDValue ShOpHi = Op.getOperand(1);
SDValue ShAmt = Op.getOperand(2);
unsigned Opc = (Op.getOpcode() == ISD::SRA_PARTS) ? ISD::SRA : ISD::SRL;
assert(Op.getOpcode() == ISD::SRA_PARTS || Op.getOpcode() == ISD::SRL_PARTS);
SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64,
DAG.getConstant(VTBits, dl, MVT::i64), ShAmt);
SDValue HiBitsForLo = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, RevShAmt);
// Unfortunately, if ShAmt == 0, we just calculated "(SHL ShOpHi, 64)" which
// is "undef". We wanted 0, so CSEL it directly.
SDValue Cmp = emitComparison(ShAmt, DAG.getConstant(0, dl, MVT::i64),
ISD::SETEQ, dl, DAG);
SDValue CCVal = DAG.getConstant(AArch64CC::EQ, dl, MVT::i32);
HiBitsForLo =
DAG.getNode(AArch64ISD::CSEL, dl, VT, DAG.getConstant(0, dl, MVT::i64),
HiBitsForLo, CCVal, Cmp);
SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64, ShAmt,
DAG.getConstant(VTBits, dl, MVT::i64));
SDValue LoBitsForLo = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, ShAmt);
SDValue LoForNormalShift =
DAG.getNode(ISD::OR, dl, VT, LoBitsForLo, HiBitsForLo);
Cmp = emitComparison(ExtraShAmt, DAG.getConstant(0, dl, MVT::i64), ISD::SETGE,
dl, DAG);
CCVal = DAG.getConstant(AArch64CC::GE, dl, MVT::i32);
SDValue LoForBigShift = DAG.getNode(Opc, dl, VT, ShOpHi, ExtraShAmt);
SDValue Lo = DAG.getNode(AArch64ISD::CSEL, dl, VT, LoForBigShift,
LoForNormalShift, CCVal, Cmp);
// AArch64 shifts larger than the register width are wrapped rather than
// clamped, so we can't just emit "hi >> x".
SDValue HiForNormalShift = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt);
SDValue HiForBigShift =
Opc == ISD::SRA
? DAG.getNode(Opc, dl, VT, ShOpHi,
DAG.getConstant(VTBits - 1, dl, MVT::i64))
: DAG.getConstant(0, dl, VT);
SDValue Hi = DAG.getNode(AArch64ISD::CSEL, dl, VT, HiForBigShift,
HiForNormalShift, CCVal, Cmp);
SDValue Ops[2] = { Lo, Hi };
return DAG.getMergeValues(Ops, dl);
}
/// LowerShiftLeftParts - Lower SHL_PARTS, which returns two
/// i64 values and take a 2 x i64 value to shift plus a shift amount.
SDValue AArch64TargetLowering::LowerShiftLeftParts(SDValue Op,
SelectionDAG &DAG) const {
assert(Op.getNumOperands() == 3 && "Not a double-shift!");
EVT VT = Op.getValueType();
unsigned VTBits = VT.getSizeInBits();
SDLoc dl(Op);
SDValue ShOpLo = Op.getOperand(0);
SDValue ShOpHi = Op.getOperand(1);
SDValue ShAmt = Op.getOperand(2);
assert(Op.getOpcode() == ISD::SHL_PARTS);
SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64,
DAG.getConstant(VTBits, dl, MVT::i64), ShAmt);
SDValue LoBitsForHi = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, RevShAmt);
// Unfortunately, if ShAmt == 0, we just calculated "(SRL ShOpLo, 64)" which
// is "undef". We wanted 0, so CSEL it directly.
SDValue Cmp = emitComparison(ShAmt, DAG.getConstant(0, dl, MVT::i64),
ISD::SETEQ, dl, DAG);
SDValue CCVal = DAG.getConstant(AArch64CC::EQ, dl, MVT::i32);
LoBitsForHi =
DAG.getNode(AArch64ISD::CSEL, dl, VT, DAG.getConstant(0, dl, MVT::i64),
LoBitsForHi, CCVal, Cmp);
SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64, ShAmt,
DAG.getConstant(VTBits, dl, MVT::i64));
SDValue HiBitsForHi = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, ShAmt);
SDValue HiForNormalShift =
DAG.getNode(ISD::OR, dl, VT, LoBitsForHi, HiBitsForHi);
SDValue HiForBigShift = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ExtraShAmt);
Cmp = emitComparison(ExtraShAmt, DAG.getConstant(0, dl, MVT::i64), ISD::SETGE,
dl, DAG);
CCVal = DAG.getConstant(AArch64CC::GE, dl, MVT::i32);
SDValue Hi = DAG.getNode(AArch64ISD::CSEL, dl, VT, HiForBigShift,
HiForNormalShift, CCVal, Cmp);
// AArch64 shifts of larger than register sizes are wrapped rather than
// clamped, so we can't just emit "lo << a" if a is too big.
SDValue LoForBigShift = DAG.getConstant(0, dl, VT);
SDValue LoForNormalShift = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
SDValue Lo = DAG.getNode(AArch64ISD::CSEL, dl, VT, LoForBigShift,
LoForNormalShift, CCVal, Cmp);
SDValue Ops[2] = { Lo, Hi };
return DAG.getMergeValues(Ops, dl);
}
bool AArch64TargetLowering::isOffsetFoldingLegal(
const GlobalAddressSDNode *GA) const {
// The AArch64 target doesn't support folding offsets into global addresses.
return false;
}
bool AArch64TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
// We can materialize #0.0 as fmov $Rd, XZR for 64-bit and 32-bit cases.
// FIXME: We should be able to handle f128 as well with a clever lowering.
if (Imm.isPosZero() && (VT == MVT::f64 || VT == MVT::f32))
return true;
if (VT == MVT::f64)
return AArch64_AM::getFP64Imm(Imm) != -1;
else if (VT == MVT::f32)
return AArch64_AM::getFP32Imm(Imm) != -1;
return false;
}
//===----------------------------------------------------------------------===//
// AArch64 Optimization Hooks
//===----------------------------------------------------------------------===//
static SDValue getEstimate(const AArch64Subtarget *ST, unsigned Opcode,
SDValue Operand, SelectionDAG &DAG,
int &ExtraSteps) {
EVT VT = Operand.getValueType();
if (ST->hasNEON() &&
(VT == MVT::f64 || VT == MVT::v1f64 || VT == MVT::v2f64 ||
VT == MVT::f32 || VT == MVT::v1f32 ||
VT == MVT::v2f32 || VT == MVT::v4f32)) {
if (ExtraSteps == TargetLoweringBase::ReciprocalEstimate::Unspecified)
// For the reciprocal estimates, convergence is quadratic, so the number
// of digits is doubled after each iteration. In ARMv8, the accuracy of
// the initial estimate is 2^-8. Thus the number of extra steps to refine
// the result for float (23 mantissa bits) is 2 and for double (52
// mantissa bits) is 3.
ExtraSteps = VT == MVT::f64 ? 3 : 2;
return DAG.getNode(Opcode, SDLoc(Operand), VT, Operand);
}
return SDValue();
}
SDValue AArch64TargetLowering::getSqrtEstimate(SDValue Operand,
SelectionDAG &DAG, int Enabled,
int &ExtraSteps,
bool &UseOneConst,
bool Reciprocal) const {
if (Enabled == ReciprocalEstimate::Enabled ||
(Enabled == ReciprocalEstimate::Unspecified && Subtarget->useRSqrt()))
if (SDValue Estimate = getEstimate(Subtarget, AArch64ISD::FRSQRTE, Operand,
DAG, ExtraSteps)) {
SDLoc DL(Operand);
EVT VT = Operand.getValueType();
SDNodeFlags Flags;
Flags.setUnsafeAlgebra(true);
// Newton reciprocal square root iteration: E * 0.5 * (3 - X * E^2)
// AArch64 reciprocal square root iteration instruction: 0.5 * (3 - M * N)
for (int i = ExtraSteps; i > 0; --i) {
SDValue Step = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Estimate,
&Flags);
Step = DAG.getNode(AArch64ISD::FRSQRTS, DL, VT, Operand, Step, &Flags);
Estimate = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Step, &Flags);
}
if (!Reciprocal) {
EVT CCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(),
VT);
SDValue FPZero = DAG.getConstantFP(0.0, DL, VT);
SDValue Eq = DAG.getSetCC(DL, CCVT, Operand, FPZero, ISD::SETEQ);
Estimate = DAG.getNode(ISD::FMUL, DL, VT, Operand, Estimate, &Flags);
// Correct the result if the operand is 0.0.
Estimate = DAG.getNode(VT.isVector() ? ISD::VSELECT : ISD::SELECT, DL,
VT, Eq, Operand, Estimate);
}
ExtraSteps = 0;
return Estimate;
}
return SDValue();
}
SDValue AArch64TargetLowering::getRecipEstimate(SDValue Operand,
SelectionDAG &DAG, int Enabled,
int &ExtraSteps) const {
if (Enabled == ReciprocalEstimate::Enabled)
if (SDValue Estimate = getEstimate(Subtarget, AArch64ISD::FRECPE, Operand,
DAG, ExtraSteps)) {
SDLoc DL(Operand);
EVT VT = Operand.getValueType();
SDNodeFlags Flags;
Flags.setUnsafeAlgebra(true);
// Newton reciprocal iteration: E * (2 - X * E)
// AArch64 reciprocal iteration instruction: (2 - M * N)
for (int i = ExtraSteps; i > 0; --i) {
SDValue Step = DAG.getNode(AArch64ISD::FRECPS, DL, VT, Operand,
Estimate, &Flags);
Estimate = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Step, &Flags);
}
ExtraSteps = 0;
return Estimate;
}
return SDValue();
}
//===----------------------------------------------------------------------===//
// AArch64 Inline Assembly Support
//===----------------------------------------------------------------------===//
// Table of Constraints
// TODO: This is the current set of constraints supported by ARM for the
// compiler, not all of them may make sense, e.g. S may be difficult to support.
//
// r - A general register
// w - An FP/SIMD register of some size in the range v0-v31
// x - An FP/SIMD register of some size in the range v0-v15
// I - Constant that can be used with an ADD instruction
// J - Constant that can be used with a SUB instruction
// K - Constant that can be used with a 32-bit logical instruction
// L - Constant that can be used with a 64-bit logical instruction
// M - Constant that can be used as a 32-bit MOV immediate
// N - Constant that can be used as a 64-bit MOV immediate
// Q - A memory reference with base register and no offset
// S - A symbolic address
// Y - Floating point constant zero
// Z - Integer constant zero
//
// Note that general register operands will be output using their 64-bit x
// register name, whatever the size of the variable, unless the asm operand
// is prefixed by the %w modifier. Floating-point and SIMD register operands
// will be output with the v prefix unless prefixed by the %b, %h, %s, %d or
// %q modifier.
const char *AArch64TargetLowering::LowerXConstraint(EVT ConstraintVT) const {
// At this point, we have to lower this constraint to something else, so we
// lower it to an "r" or "w". However, by doing this we will force the result
// to be in register, while the X constraint is much more permissive.
//
// Although we are correct (we are free to emit anything, without
// constraints), we might break use cases that would expect us to be more
// efficient and emit something else.
if (!Subtarget->hasFPARMv8())
return "r";
if (ConstraintVT.isFloatingPoint())
return "w";
if (ConstraintVT.isVector() &&
(ConstraintVT.getSizeInBits() == 64 ||
ConstraintVT.getSizeInBits() == 128))
return "w";
return "r";
}
/// getConstraintType - Given a constraint letter, return the type of
/// constraint it is for this target.
AArch64TargetLowering::ConstraintType
AArch64TargetLowering::getConstraintType(StringRef Constraint) const {
if (Constraint.size() == 1) {
switch (Constraint[0]) {
default:
break;
case 'z':
return C_Other;
case 'x':
case 'w':
return C_RegisterClass;
// An address with a single base register. Due to the way we
// currently handle addresses it is the same as 'r'.
case 'Q':
return C_Memory;
}
}
return TargetLowering::getConstraintType(Constraint);
}
/// Examine constraint type and operand type and determine a weight value.
/// This object must already have been set up with the operand type
/// and the current alternative constraint selected.
TargetLowering::ConstraintWeight
AArch64TargetLowering::getSingleConstraintMatchWeight(
AsmOperandInfo &info, const char *constraint) const {
ConstraintWeight weight = CW_Invalid;
Value *CallOperandVal = info.CallOperandVal;
// If we don't have a value, we can't do a match,
// but allow it at the lowest weight.
if (!CallOperandVal)
return CW_Default;
Type *type = CallOperandVal->getType();
// Look at the constraint type.
switch (*constraint) {
default:
weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
break;
case 'x':
case 'w':
if (type->isFloatingPointTy() || type->isVectorTy())
weight = CW_Register;
break;
case 'z':
weight = CW_Constant;
break;
}
return weight;
}
std::pair<unsigned, const TargetRegisterClass *>
AArch64TargetLowering::getRegForInlineAsmConstraint(
const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const {
if (Constraint.size() == 1) {
switch (Constraint[0]) {
case 'r':
if (VT.getSizeInBits() == 64)
return std::make_pair(0U, &AArch64::GPR64commonRegClass);
return std::make_pair(0U, &AArch64::GPR32commonRegClass);
case 'w':
if (VT.getSizeInBits() == 16)
return std::make_pair(0U, &AArch64::FPR16RegClass);
if (VT.getSizeInBits() == 32)
return std::make_pair(0U, &AArch64::FPR32RegClass);
if (VT.getSizeInBits() == 64)
return std::make_pair(0U, &AArch64::FPR64RegClass);
if (VT.getSizeInBits() == 128)
return std::make_pair(0U, &AArch64::FPR128RegClass);
break;
// The instructions that this constraint is designed for can
// only take 128-bit registers so just use that regclass.
case 'x':
if (VT.getSizeInBits() == 128)
return std::make_pair(0U, &AArch64::FPR128_loRegClass);
break;
}
}
if (StringRef("{cc}").equals_lower(Constraint))
return std::make_pair(unsigned(AArch64::NZCV), &AArch64::CCRRegClass);
// Use the default implementation in TargetLowering to convert the register
// constraint into a member of a register class.
std::pair<unsigned, const TargetRegisterClass *> Res;
Res = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
// Not found as a standard register?
if (!Res.second) {
unsigned Size = Constraint.size();
if ((Size == 4 || Size == 5) && Constraint[0] == '{' &&
tolower(Constraint[1]) == 'v' && Constraint[Size - 1] == '}') {
int RegNo;
bool Failed = Constraint.slice(2, Size - 1).getAsInteger(10, RegNo);
if (!Failed && RegNo >= 0 && RegNo <= 31) {
// v0 - v31 are aliases of q0 - q31 or d0 - d31 depending on size.
// By default we'll emit v0-v31 for this unless there's a modifier where
// we'll emit the correct register as well.
if (VT != MVT::Other && VT.getSizeInBits() == 64) {
Res.first = AArch64::FPR64RegClass.getRegister(RegNo);
Res.second = &AArch64::FPR64RegClass;
} else {
Res.first = AArch64::FPR128RegClass.getRegister(RegNo);
Res.second = &AArch64::FPR128RegClass;
}
}
}
}
return Res;
}
/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
/// vector. If it is invalid, don't add anything to Ops.
void AArch64TargetLowering::LowerAsmOperandForConstraint(
SDValue Op, std::string &Constraint, std::vector<SDValue> &Ops,
SelectionDAG &DAG) const {
SDValue Result;
// Currently only support length 1 constraints.
if (Constraint.length() != 1)
return;
char ConstraintLetter = Constraint[0];
switch (ConstraintLetter) {
default:
break;
// This set of constraints deal with valid constants for various instructions.
// Validate and return a target constant for them if we can.
case 'z': {
// 'z' maps to xzr or wzr so it needs an input of 0.
if (!isNullConstant(Op))
return;
if (Op.getValueType() == MVT::i64)
Result = DAG.getRegister(AArch64::XZR, MVT::i64);
else
Result = DAG.getRegister(AArch64::WZR, MVT::i32);
break;
}
case 'I':
case 'J':
case 'K':
case 'L':
case 'M':
case 'N':
ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op);
if (!C)
return;
// Grab the value and do some validation.
uint64_t CVal = C->getZExtValue();
switch (ConstraintLetter) {
// The I constraint applies only to simple ADD or SUB immediate operands:
// i.e. 0 to 4095 with optional shift by 12
// The J constraint applies only to ADD or SUB immediates that would be
// valid when negated, i.e. if [an add pattern] were to be output as a SUB
// instruction [or vice versa], in other words -1 to -4095 with optional
// left shift by 12.
case 'I':
if (isUInt<12>(CVal) || isShiftedUInt<12, 12>(CVal))
break;
return;
case 'J': {
uint64_t NVal = -C->getSExtValue();
if (isUInt<12>(NVal) || isShiftedUInt<12, 12>(NVal)) {
CVal = C->getSExtValue();
break;
}
return;
}
// The K and L constraints apply *only* to logical immediates, including
// what used to be the MOVI alias for ORR (though the MOVI alias has now
// been removed and MOV should be used). So these constraints have to
// distinguish between bit patterns that are valid 32-bit or 64-bit
// "bitmask immediates": for example 0xaaaaaaaa is a valid bimm32 (K), but
// not a valid bimm64 (L) where 0xaaaaaaaaaaaaaaaa would be valid, and vice
// versa.
case 'K':
if (AArch64_AM::isLogicalImmediate(CVal, 32))
break;
return;
case 'L':
if (AArch64_AM::isLogicalImmediate(CVal, 64))
break;
return;
// The M and N constraints are a superset of K and L respectively, for use
// with the MOV (immediate) alias. As well as the logical immediates they
// also match 32 or 64-bit immediates that can be loaded either using a
// *single* MOVZ or MOVN , such as 32-bit 0x12340000, 0x00001234, 0xffffedca
// (M) or 64-bit 0x1234000000000000 (N) etc.
// As a note some of this code is liberally stolen from the asm parser.
case 'M': {
if (!isUInt<32>(CVal))
return;
if (AArch64_AM::isLogicalImmediate(CVal, 32))
break;
if ((CVal & 0xFFFF) == CVal)
break;
if ((CVal & 0xFFFF0000ULL) == CVal)
break;
uint64_t NCVal = ~(uint32_t)CVal;
if ((NCVal & 0xFFFFULL) == NCVal)
break;
if ((NCVal & 0xFFFF0000ULL) == NCVal)
break;
return;
}
case 'N': {
if (AArch64_AM::isLogicalImmediate(CVal, 64))
break;
if ((CVal & 0xFFFFULL) == CVal)
break;
if ((CVal & 0xFFFF0000ULL) == CVal)
break;
if ((CVal & 0xFFFF00000000ULL) == CVal)
break;
if ((CVal & 0xFFFF000000000000ULL) == CVal)
break;
uint64_t NCVal = ~CVal;
if ((NCVal & 0xFFFFULL) == NCVal)
break;
if ((NCVal & 0xFFFF0000ULL) == NCVal)
break;
if ((NCVal & 0xFFFF00000000ULL) == NCVal)
break;
if ((NCVal & 0xFFFF000000000000ULL) == NCVal)
break;
return;
}
default:
return;
}
// All assembler immediates are 64-bit integers.
Result = DAG.getTargetConstant(CVal, SDLoc(Op), MVT::i64);
break;
}
if (Result.getNode()) {
Ops.push_back(Result);
return;
}
return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
}
//===----------------------------------------------------------------------===//
// AArch64 Advanced SIMD Support
//===----------------------------------------------------------------------===//
/// WidenVector - Given a value in the V64 register class, produce the
/// equivalent value in the V128 register class.
static SDValue WidenVector(SDValue V64Reg, SelectionDAG &DAG) {
EVT VT = V64Reg.getValueType();
unsigned NarrowSize = VT.getVectorNumElements();
MVT EltTy = VT.getVectorElementType().getSimpleVT();
MVT WideTy = MVT::getVectorVT(EltTy, 2 * NarrowSize);
SDLoc DL(V64Reg);
return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideTy, DAG.getUNDEF(WideTy),
V64Reg, DAG.getConstant(0, DL, MVT::i32));
}
/// getExtFactor - Determine the adjustment factor for the position when
/// generating an "extract from vector registers" instruction.
static unsigned getExtFactor(SDValue &V) {
EVT EltType = V.getValueType().getVectorElementType();
return EltType.getSizeInBits() / 8;
}
/// NarrowVector - Given a value in the V128 register class, produce the
/// equivalent value in the V64 register class.
static SDValue NarrowVector(SDValue V128Reg, SelectionDAG &DAG) {
EVT VT = V128Reg.getValueType();
unsigned WideSize = VT.getVectorNumElements();
MVT EltTy = VT.getVectorElementType().getSimpleVT();
MVT NarrowTy = MVT::getVectorVT(EltTy, WideSize / 2);
SDLoc DL(V128Reg);
return DAG.getTargetExtractSubreg(AArch64::dsub, DL, NarrowTy, V128Reg);
}
// Gather data to see if the operation can be modelled as a
// shuffle in combination with VEXTs.
SDValue AArch64TargetLowering::ReconstructShuffle(SDValue Op,
SelectionDAG &DAG) const {
assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
SDLoc dl(Op);
EVT VT = Op.getValueType();
unsigned NumElts = VT.getVectorNumElements();
struct ShuffleSourceInfo {
SDValue Vec;
unsigned MinElt;
unsigned MaxElt;
// We may insert some combination of BITCASTs and VEXT nodes to force Vec to
// be compatible with the shuffle we intend to construct. As a result
// ShuffleVec will be some sliding window into the original Vec.
SDValue ShuffleVec;
// Code should guarantee that element i in Vec starts at element "WindowBase
// + i * WindowScale in ShuffleVec".
int WindowBase;
int WindowScale;
ShuffleSourceInfo(SDValue Vec)
: Vec(Vec), MinElt(std::numeric_limits<unsigned>::max()), MaxElt(0),
ShuffleVec(Vec), WindowBase(0), WindowScale(1) {}
bool operator ==(SDValue OtherVec) { return Vec == OtherVec; }
};
// First gather all vectors used as an immediate source for this BUILD_VECTOR
// node.
SmallVector<ShuffleSourceInfo, 2> Sources;
for (unsigned i = 0; i < NumElts; ++i) {
SDValue V = Op.getOperand(i);
if (V.isUndef())
continue;
else if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
!isa<ConstantSDNode>(V.getOperand(1))) {
// A shuffle can only come from building a vector from various
// elements of other vectors, provided their indices are constant.
return SDValue();
}
// Add this element source to the list if it's not already there.
SDValue SourceVec = V.getOperand(0);
auto Source = find(Sources, SourceVec);
if (Source == Sources.end())
Source = Sources.insert(Sources.end(), ShuffleSourceInfo(SourceVec));
// Update the minimum and maximum lane number seen.
unsigned EltNo = cast<ConstantSDNode>(V.getOperand(1))->getZExtValue();
Source->MinElt = std::min(Source->MinElt, EltNo);
Source->MaxElt = std::max(Source->MaxElt, EltNo);
}
// Currently only do something sane when at most two source vectors
// are involved.
if (Sources.size() > 2)
return SDValue();
// Find out the smallest element size among result and two sources, and use
// it as element size to build the shuffle_vector.
EVT SmallestEltTy = VT.getVectorElementType();
for (auto &Source : Sources) {
EVT SrcEltTy = Source.Vec.getValueType().getVectorElementType();
if (SrcEltTy.bitsLT(SmallestEltTy)) {
SmallestEltTy = SrcEltTy;
}
}
unsigned ResMultiplier =
VT.getScalarSizeInBits() / SmallestEltTy.getSizeInBits();
NumElts = VT.getSizeInBits() / SmallestEltTy.getSizeInBits();
EVT ShuffleVT = EVT::getVectorVT(*DAG.getContext(), SmallestEltTy, NumElts);
// If the source vector is too wide or too narrow, we may nevertheless be able
// to construct a compatible shuffle either by concatenating it with UNDEF or
// extracting a suitable range of elements.
for (auto &Src : Sources) {
EVT SrcVT = Src.ShuffleVec.getValueType();
if (SrcVT.getSizeInBits() == VT.getSizeInBits())
continue;
// This stage of the search produces a source with the same element type as
// the original, but with a total width matching the BUILD_VECTOR output.
EVT EltVT = SrcVT.getVectorElementType();
unsigned NumSrcElts = VT.getSizeInBits() / EltVT.getSizeInBits();
EVT DestVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumSrcElts);
if (SrcVT.getSizeInBits() < VT.getSizeInBits()) {
assert(2 * SrcVT.getSizeInBits() == VT.getSizeInBits());
// We can pad out the smaller vector for free, so if it's part of a
// shuffle...
Src.ShuffleVec =
DAG.getNode(ISD::CONCAT_VECTORS, dl, DestVT, Src.ShuffleVec,
DAG.getUNDEF(Src.ShuffleVec.getValueType()));
continue;
}
assert(SrcVT.getSizeInBits() == 2 * VT.getSizeInBits());
if (Src.MaxElt - Src.MinElt >= NumSrcElts) {
// Span too large for a VEXT to cope
return SDValue();
}
if (Src.MinElt >= NumSrcElts) {
// The extraction can just take the second half
Src.ShuffleVec =
DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
DAG.getConstant(NumSrcElts, dl, MVT::i64));
Src.WindowBase = -NumSrcElts;
} else if (Src.MaxElt < NumSrcElts) {
// The extraction can just take the first half
Src.ShuffleVec =
DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
DAG.getConstant(0, dl, MVT::i64));
} else {
// An actual VEXT is needed
SDValue VEXTSrc1 =
DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
DAG.getConstant(0, dl, MVT::i64));
SDValue VEXTSrc2 =
DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
DAG.getConstant(NumSrcElts, dl, MVT::i64));
unsigned Imm = Src.MinElt * getExtFactor(VEXTSrc1);
Src.ShuffleVec = DAG.getNode(AArch64ISD::EXT, dl, DestVT, VEXTSrc1,
VEXTSrc2,
DAG.getConstant(Imm, dl, MVT::i32));
Src.WindowBase = -Src.MinElt;
}
}
// Another possible incompatibility occurs from the vector element types. We
// can fix this by bitcasting the source vectors to the same type we intend
// for the shuffle.
for (auto &Src : Sources) {
EVT SrcEltTy = Src.ShuffleVec.getValueType().getVectorElementType();
if (SrcEltTy == SmallestEltTy)
continue;
assert(ShuffleVT.getVectorElementType() == SmallestEltTy);
Src.ShuffleVec = DAG.getNode(ISD::BITCAST, dl, ShuffleVT, Src.ShuffleVec);
Src.WindowScale = SrcEltTy.getSizeInBits() / SmallestEltTy.getSizeInBits();
Src.WindowBase *= Src.WindowScale;
}
// Final sanity check before we try to actually produce a shuffle.
DEBUG(
for (auto Src : Sources)
assert(Src.ShuffleVec.getValueType() == ShuffleVT);
);
// The stars all align, our next step is to produce the mask for the shuffle.
SmallVector<int, 8> Mask(ShuffleVT.getVectorNumElements(), -1);
int BitsPerShuffleLane = ShuffleVT.getScalarSizeInBits();
for (unsigned i = 0; i < VT.getVectorNumElements(); ++i) {
SDValue Entry = Op.getOperand(i);
if (Entry.isUndef())
continue;
auto Src = find(Sources, Entry.getOperand(0));
int EltNo = cast<ConstantSDNode>(Entry.getOperand(1))->getSExtValue();
// EXTRACT_VECTOR_ELT performs an implicit any_ext; BUILD_VECTOR an implicit
// trunc. So only std::min(SrcBits, DestBits) actually get defined in this
// segment.
EVT OrigEltTy = Entry.getOperand(0).getValueType().getVectorElementType();
int BitsDefined =
std::min(OrigEltTy.getSizeInBits(), VT.getScalarSizeInBits());
int LanesDefined = BitsDefined / BitsPerShuffleLane;
// This source is expected to fill ResMultiplier lanes of the final shuffle,
// starting at the appropriate offset.
int *LaneMask = &Mask[i * ResMultiplier];
int ExtractBase = EltNo * Src->WindowScale + Src->WindowBase;
ExtractBase += NumElts * (Src - Sources.begin());
for (int j = 0; j < LanesDefined; ++j)
LaneMask[j] = ExtractBase + j;
}
// Final check before we try to produce nonsense...
if (!isShuffleMaskLegal(Mask, ShuffleVT))
return SDValue();
SDValue ShuffleOps[] = { DAG.getUNDEF(ShuffleVT), DAG.getUNDEF(ShuffleVT) };
for (unsigned i = 0; i < Sources.size(); ++i)
ShuffleOps[i] = Sources[i].ShuffleVec;
SDValue Shuffle = DAG.getVectorShuffle(ShuffleVT, dl, ShuffleOps[0],
ShuffleOps[1], Mask);
return DAG.getNode(ISD::BITCAST, dl, VT, Shuffle);
}
// check if an EXT instruction can handle the shuffle mask when the
// vector sources of the shuffle are the same.
static bool isSingletonEXTMask(ArrayRef<int> M, EVT VT, unsigned &Imm) {
unsigned NumElts = VT.getVectorNumElements();
// Assume that the first shuffle index is not UNDEF. Fail if it is.
if (M[0] < 0)
return false;
Imm = M[0];
// If this is a VEXT shuffle, the immediate value is the index of the first
// element. The other shuffle indices must be the successive elements after
// the first one.
unsigned ExpectedElt = Imm;
for (unsigned i = 1; i < NumElts; ++i) {
// Increment the expected index. If it wraps around, just follow it
// back to index zero and keep going.
++ExpectedElt;
if (ExpectedElt == NumElts)
ExpectedElt = 0;
if (M[i] < 0)
continue; // ignore UNDEF indices
if (ExpectedElt != static_cast<unsigned>(M[i]))
return false;
}
return true;
}
// check if an EXT instruction can handle the shuffle mask when the
// vector sources of the shuffle are different.
static bool isEXTMask(ArrayRef<int> M, EVT VT, bool &ReverseEXT,
unsigned &Imm) {
// Look for the first non-undef element.
const int *FirstRealElt = find_if(M, [](int Elt) { return Elt >= 0; });
// Benefit form APInt to handle overflow when calculating expected element.
unsigned NumElts = VT.getVectorNumElements();
unsigned MaskBits = APInt(32, NumElts * 2).logBase2();
APInt ExpectedElt = APInt(MaskBits, *FirstRealElt + 1);
// The following shuffle indices must be the successive elements after the
// first real element.
const int *FirstWrongElt = std::find_if(FirstRealElt + 1, M.end(),
[&](int Elt) {return Elt != ExpectedElt++ && Elt != -1;});
if (FirstWrongElt != M.end())
return false;
// The index of an EXT is the first element if it is not UNDEF.
// Watch out for the beginning UNDEFs. The EXT index should be the expected
// value of the first element. E.g.
// <-1, -1, 3, ...> is treated as <1, 2, 3, ...>.
// <-1, -1, 0, 1, ...> is treated as <2*NumElts-2, 2*NumElts-1, 0, 1, ...>.
// ExpectedElt is the last mask index plus 1.
Imm = ExpectedElt.getZExtValue();
// There are two difference cases requiring to reverse input vectors.
// For example, for vector <4 x i32> we have the following cases,
// Case 1: shufflevector(<4 x i32>,<4 x i32>,<-1, -1, -1, 0>)
// Case 2: shufflevector(<4 x i32>,<4 x i32>,<-1, -1, 7, 0>)
// For both cases, we finally use mask <5, 6, 7, 0>, which requires
// to reverse two input vectors.
if (Imm < NumElts)
ReverseEXT = true;
else
Imm -= NumElts;
return true;
}
/// isREVMask - Check if a vector shuffle corresponds to a REV
/// instruction with the specified blocksize. (The order of the elements
/// within each block of the vector is reversed.)
static bool isREVMask(ArrayRef<int> M, EVT VT, unsigned BlockSize) {
assert((BlockSize == 16 || BlockSize == 32 || BlockSize == 64) &&
"Only possible block sizes for REV are: 16, 32, 64");
unsigned EltSz = VT.getScalarSizeInBits();
if (EltSz == 64)
return false;
unsigned NumElts = VT.getVectorNumElements();
unsigned BlockElts = M[0] + 1;
// If the first shuffle index is UNDEF, be optimistic.
if (M[0] < 0)
BlockElts = BlockSize / EltSz;
if (BlockSize <= EltSz || BlockSize != BlockElts * EltSz)
return false;
for (unsigned i = 0; i < NumElts; ++i) {
if (M[i] < 0)
continue; // ignore UNDEF indices
if ((unsigned)M[i] != (i - i % BlockElts) + (BlockElts - 1 - i % BlockElts))
return false;
}
return true;
}
static bool isZIPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
unsigned NumElts = VT.getVectorNumElements();
WhichResult = (M[0] == 0 ? 0 : 1);
unsigned Idx = WhichResult * NumElts / 2;
for (unsigned i = 0; i != NumElts; i += 2) {
if ((M[i] >= 0 && (unsigned)M[i] != Idx) ||
(M[i + 1] >= 0 && (unsigned)M[i + 1] != Idx + NumElts))
return false;
Idx += 1;
}
return true;
}
static bool isUZPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
unsigned NumElts = VT.getVectorNumElements();
WhichResult = (M[0] == 0 ? 0 : 1);
for (unsigned i = 0; i != NumElts; ++i) {
if (M[i] < 0)
continue; // ignore UNDEF indices
if ((unsigned)M[i] != 2 * i + WhichResult)
return false;
}
return true;
}
static bool isTRNMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
unsigned NumElts = VT.getVectorNumElements();
WhichResult = (M[0] == 0 ? 0 : 1);
for (unsigned i = 0; i < NumElts; i += 2) {
if ((M[i] >= 0 && (unsigned)M[i] != i + WhichResult) ||
(M[i + 1] >= 0 && (unsigned)M[i + 1] != i + NumElts + WhichResult))
return false;
}
return true;
}
/// isZIP_v_undef_Mask - Special case of isZIPMask for canonical form of
/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
/// Mask is e.g., <0, 0, 1, 1> instead of <0, 4, 1, 5>.
static bool isZIP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
unsigned NumElts = VT.getVectorNumElements();
WhichResult = (M[0] == 0 ? 0 : 1);
unsigned Idx = WhichResult * NumElts / 2;
for (unsigned i = 0; i != NumElts; i += 2) {
if ((M[i] >= 0 && (unsigned)M[i] != Idx) ||
(M[i + 1] >= 0 && (unsigned)M[i + 1] != Idx))
return false;
Idx += 1;
}
return true;
}
/// isUZP_v_undef_Mask - Special case of isUZPMask for canonical form of
/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
/// Mask is e.g., <0, 2, 0, 2> instead of <0, 2, 4, 6>,
static bool isUZP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
unsigned Half = VT.getVectorNumElements() / 2;
WhichResult = (M[0] == 0 ? 0 : 1);
for (unsigned j = 0; j != 2; ++j) {
unsigned Idx = WhichResult;
for (unsigned i = 0; i != Half; ++i) {
int MIdx = M[i + j * Half];
if (MIdx >= 0 && (unsigned)MIdx != Idx)
return false;
Idx += 2;
}
}
return true;
}
/// isTRN_v_undef_Mask - Special case of isTRNMask for canonical form of
/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
/// Mask is e.g., <0, 0, 2, 2> instead of <0, 4, 2, 6>.
static bool isTRN_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
unsigned NumElts = VT.getVectorNumElements();
WhichResult = (M[0] == 0 ? 0 : 1);
for (unsigned i = 0; i < NumElts; i += 2) {
if ((M[i] >= 0 && (unsigned)M[i] != i + WhichResult) ||
(M[i + 1] >= 0 && (unsigned)M[i + 1] != i + WhichResult))
return false;
}
return true;
}
static bool isINSMask(ArrayRef<int> M, int NumInputElements,
bool &DstIsLeft, int &Anomaly) {
if (M.size() != static_cast<size_t>(NumInputElements))
return false;
int NumLHSMatch = 0, NumRHSMatch = 0;
int LastLHSMismatch = -1, LastRHSMismatch = -1;
for (int i = 0; i < NumInputElements; ++i) {
if (M[i] == -1) {
++NumLHSMatch;
++NumRHSMatch;
continue;
}
if (M[i] == i)
++NumLHSMatch;
else
LastLHSMismatch = i;
if (M[i] == i + NumInputElements)
++NumRHSMatch;
else
LastRHSMismatch = i;
}
if (NumLHSMatch == NumInputElements - 1) {
DstIsLeft = true;
Anomaly = LastLHSMismatch;
return true;
} else if (NumRHSMatch == NumInputElements - 1) {
DstIsLeft = false;
Anomaly = LastRHSMismatch;
return true;
}
return false;
}
static bool isConcatMask(ArrayRef<int> Mask, EVT VT, bool SplitLHS) {
if (VT.getSizeInBits() != 128)
return false;
unsigned NumElts = VT.getVectorNumElements();
for (int I = 0, E = NumElts / 2; I != E; I++) {
if (Mask[I] != I)
return false;
}
int Offset = NumElts / 2;
for (int I = NumElts / 2, E = NumElts; I != E; I++) {
if (Mask[I] != I + SplitLHS * Offset)
return false;
}
return true;
}
static SDValue tryFormConcatFromShuffle(SDValue Op, SelectionDAG &DAG) {
SDLoc DL(Op);
EVT VT = Op.getValueType();
SDValue V0 = Op.getOperand(0);
SDValue V1 = Op.getOperand(1);
ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Op)->getMask();
if (VT.getVectorElementType() != V0.getValueType().getVectorElementType() ||
VT.getVectorElementType() != V1.getValueType().getVectorElementType())
return SDValue();
bool SplitV0 = V0.getValueSizeInBits() == 128;
if (!isConcatMask(Mask, VT, SplitV0))
return SDValue();
EVT CastVT = EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(),
VT.getVectorNumElements() / 2);
if (SplitV0) {
V0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, CastVT, V0,
DAG.getConstant(0, DL, MVT::i64));
}
if (V1.getValueSizeInBits() == 128) {
V1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, CastVT, V1,
DAG.getConstant(0, DL, MVT::i64));
}
return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, V0, V1);
}
/// GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit
/// the specified operations to build the shuffle.
static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS,
SDValue RHS, SelectionDAG &DAG,
const SDLoc &dl) {
unsigned OpNum = (PFEntry >> 26) & 0x0F;
unsigned LHSID = (PFEntry >> 13) & ((1 << 13) - 1);
unsigned RHSID = (PFEntry >> 0) & ((1 << 13) - 1);
enum {
OP_COPY = 0, // Copy, used for things like <u,u,u,3> to say it is <0,1,2,3>
OP_VREV,
OP_VDUP0,
OP_VDUP1,
OP_VDUP2,
OP_VDUP3,
OP_VEXT1,
OP_VEXT2,
OP_VEXT3,
OP_VUZPL, // VUZP, left result
OP_VUZPR, // VUZP, right result
OP_VZIPL, // VZIP, left result
OP_VZIPR, // VZIP, right result
OP_VTRNL, // VTRN, left result
OP_VTRNR // VTRN, right result
};
if (OpNum == OP_COPY) {
if (LHSID == (1 * 9 + 2) * 9 + 3)
return LHS;
assert(LHSID == ((4 * 9 + 5) * 9 + 6) * 9 + 7 && "Illegal OP_COPY!");
return RHS;
}
SDValue OpLHS, OpRHS;
OpLHS = GeneratePerfectShuffle(PerfectShuffleTable[LHSID], LHS, RHS, DAG, dl);
OpRHS = GeneratePerfectShuffle(PerfectShuffleTable[RHSID], LHS, RHS, DAG, dl);
EVT VT = OpLHS.getValueType();
switch (OpNum) {
default:
llvm_unreachable("Unknown shuffle opcode!");
case OP_VREV:
// VREV divides the vector in half and swaps within the half.
if (VT.getVectorElementType() == MVT::i32 ||
VT.getVectorElementType() == MVT::f32)
return DAG.getNode(AArch64ISD::REV64, dl, VT, OpLHS);
// vrev <4 x i16> -> REV32
if (VT.getVectorElementType() == MVT::i16 ||
VT.getVectorElementType() == MVT::f16)
return DAG.getNode(AArch64ISD::REV32, dl, VT, OpLHS);
// vrev <4 x i8> -> REV16
assert(VT.getVectorElementType() == MVT::i8);
return DAG.getNode(AArch64ISD::REV16, dl, VT, OpLHS);
case OP_VDUP0:
case OP_VDUP1:
case OP_VDUP2:
case OP_VDUP3: {
EVT EltTy = VT.getVectorElementType();
unsigned Opcode;
if (EltTy == MVT::i8)
Opcode = AArch64ISD::DUPLANE8;
else if (EltTy == MVT::i16 || EltTy == MVT::f16)
Opcode = AArch64ISD::DUPLANE16;
else if (EltTy == MVT::i32 || EltTy == MVT::f32)
Opcode = AArch64ISD::DUPLANE32;
else if (EltTy == MVT::i64 || EltTy == MVT::f64)
Opcode = AArch64ISD::DUPLANE64;
else
llvm_unreachable("Invalid vector element type?");
if (VT.getSizeInBits() == 64)
OpLHS = WidenVector(OpLHS, DAG);
SDValue Lane = DAG.getConstant(OpNum - OP_VDUP0, dl, MVT::i64);
return DAG.getNode(Opcode, dl, VT, OpLHS, Lane);
}
case OP_VEXT1:
case OP_VEXT2:
case OP_VEXT3: {
unsigned Imm = (OpNum - OP_VEXT1 + 1) * getExtFactor(OpLHS);
return DAG.getNode(AArch64ISD::EXT, dl, VT, OpLHS, OpRHS,
DAG.getConstant(Imm, dl, MVT::i32));
}
case OP_VUZPL:
return DAG.getNode(AArch64ISD::UZP1, dl, DAG.getVTList(VT, VT), OpLHS,
OpRHS);
case OP_VUZPR:
return DAG.getNode(AArch64ISD::UZP2, dl, DAG.getVTList(VT, VT), OpLHS,
OpRHS);
case OP_VZIPL:
return DAG.getNode(AArch64ISD::ZIP1, dl, DAG.getVTList(VT, VT), OpLHS,
OpRHS);
case OP_VZIPR:
return DAG.getNode(AArch64ISD::ZIP2, dl, DAG.getVTList(VT, VT), OpLHS,
OpRHS);
case OP_VTRNL:
return DAG.getNode(AArch64ISD::TRN1, dl, DAG.getVTList(VT, VT), OpLHS,
OpRHS);
case OP_VTRNR:
return DAG.getNode(AArch64ISD::TRN2, dl, DAG.getVTList(VT, VT), OpLHS,
OpRHS);
}
}
static SDValue GenerateTBL(SDValue Op, ArrayRef<int> ShuffleMask,
SelectionDAG &DAG) {
// Check to see if we can use the TBL instruction.
SDValue V1 = Op.getOperand(0);
SDValue V2 = Op.getOperand(1);
SDLoc DL(Op);
EVT EltVT = Op.getValueType().getVectorElementType();
unsigned BytesPerElt = EltVT.getSizeInBits() / 8;
SmallVector<SDValue, 8> TBLMask;
for (int Val : ShuffleMask) {
for (unsigned Byte = 0; Byte < BytesPerElt; ++Byte) {
unsigned Offset = Byte + Val * BytesPerElt;
TBLMask.push_back(DAG.getConstant(Offset, DL, MVT::i32));
}
}
MVT IndexVT = MVT::v8i8;
unsigned IndexLen = 8;
if (Op.getValueSizeInBits() == 128) {
IndexVT = MVT::v16i8;
IndexLen = 16;
}
SDValue V1Cst = DAG.getNode(ISD::BITCAST, DL, IndexVT, V1);
SDValue V2Cst = DAG.getNode(ISD::BITCAST, DL, IndexVT, V2);
SDValue Shuffle;
if (V2.getNode()->isUndef()) {
if (IndexLen == 8)
V1Cst = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V1Cst, V1Cst);
Shuffle = DAG.getNode(
ISD::INTRINSIC_WO_CHAIN, DL, IndexVT,
DAG.getConstant(Intrinsic::aarch64_neon_tbl1, DL, MVT::i32), V1Cst,
DAG.getBuildVector(IndexVT, DL,
makeArrayRef(TBLMask.data(), IndexLen)));
} else {
if (IndexLen == 8) {
V1Cst = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V1Cst, V2Cst);
Shuffle = DAG.getNode(
ISD::INTRINSIC_WO_CHAIN, DL, IndexVT,
DAG.getConstant(Intrinsic::aarch64_neon_tbl1, DL, MVT::i32), V1Cst,
DAG.getBuildVector(IndexVT, DL,
makeArrayRef(TBLMask.data(), IndexLen)));
} else {
// FIXME: We cannot, for the moment, emit a TBL2 instruction because we
// cannot currently represent the register constraints on the input
// table registers.
// Shuffle = DAG.getNode(AArch64ISD::TBL2, DL, IndexVT, V1Cst, V2Cst,
// DAG.getBuildVector(IndexVT, DL, &TBLMask[0],
// IndexLen));
Shuffle = DAG.getNode(
ISD::INTRINSIC_WO_CHAIN, DL, IndexVT,
DAG.getConstant(Intrinsic::aarch64_neon_tbl2, DL, MVT::i32), V1Cst,
V2Cst, DAG.getBuildVector(IndexVT, DL,
makeArrayRef(TBLMask.data(), IndexLen)));
}
}
return DAG.getNode(ISD::BITCAST, DL, Op.getValueType(), Shuffle);
}
static unsigned getDUPLANEOp(EVT EltType) {
if (EltType == MVT::i8)
return AArch64ISD::DUPLANE8;
if (EltType == MVT::i16 || EltType == MVT::f16)
return AArch64ISD::DUPLANE16;
if (EltType == MVT::i32 || EltType == MVT::f32)
return AArch64ISD::DUPLANE32;
if (EltType == MVT::i64 || EltType == MVT::f64)
return AArch64ISD::DUPLANE64;
llvm_unreachable("Invalid vector element type?");
}
SDValue AArch64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
SelectionDAG &DAG) const {
SDLoc dl(Op);
EVT VT = Op.getValueType();
ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode());
// Convert shuffles that are directly supported on NEON to target-specific
// DAG nodes, instead of keeping them as shuffles and matching them again
// during code selection. This is more efficient and avoids the possibility
// of inconsistencies between legalization and selection.
ArrayRef<int> ShuffleMask = SVN->getMask();
SDValue V1 = Op.getOperand(0);
SDValue V2 = Op.getOperand(1);
if (SVN->isSplat()) {
int Lane = SVN->getSplatIndex();
// If this is undef splat, generate it via "just" vdup, if possible.
if (Lane == -1)
Lane = 0;
if (Lane == 0 && V1.getOpcode() == ISD::SCALAR_TO_VECTOR)
return DAG.getNode(AArch64ISD::DUP, dl, V1.getValueType(),
V1.getOperand(0));
// Test if V1 is a BUILD_VECTOR and the lane being referenced is a non-
// constant. If so, we can just reference the lane's definition directly.
if (V1.getOpcode() == ISD::BUILD_VECTOR &&
!isa<ConstantSDNode>(V1.getOperand(Lane)))
return DAG.getNode(AArch64ISD::DUP, dl, VT, V1.getOperand(Lane));
// Otherwise, duplicate from the lane of the input vector.
unsigned Opcode = getDUPLANEOp(V1.getValueType().getVectorElementType());
// SelectionDAGBuilder may have "helpfully" already extracted or conatenated
// to make a vector of the same size as this SHUFFLE. We can ignore the
// extract entirely, and canonicalise the concat using WidenVector.
if (V1.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
Lane += cast<ConstantSDNode>(V1.getOperand(1))->getZExtValue();
V1 = V1.getOperand(0);
} else if (V1.getOpcode() == ISD::CONCAT_VECTORS) {
unsigned Idx = Lane >= (int)VT.getVectorNumElements() / 2;
Lane -= Idx * VT.getVectorNumElements() / 2;
V1 = WidenVector(V1.getOperand(Idx), DAG);
} else if (VT.getSizeInBits() == 64)
V1 = WidenVector(V1, DAG);
return DAG.getNode(Opcode, dl, VT, V1, DAG.getConstant(Lane, dl, MVT::i64));
}
if (isREVMask(ShuffleMask, VT, 64))
return DAG.getNode(AArch64ISD::REV64, dl, V1.getValueType(), V1, V2);
if (isREVMask(ShuffleMask, VT, 32))
return DAG.getNode(AArch64ISD::REV32, dl, V1.getValueType(), V1, V2);
if (isREVMask(ShuffleMask, VT, 16))
return DAG.getNode(AArch64ISD::REV16, dl, V1.getValueType(), V1, V2);
bool ReverseEXT = false;
unsigned Imm;
if (isEXTMask(ShuffleMask, VT, ReverseEXT, Imm)) {
if (ReverseEXT)
std::swap(V1, V2);
Imm *= getExtFactor(V1);
return DAG.getNode(AArch64ISD::EXT, dl, V1.getValueType(), V1, V2,
DAG.getConstant(Imm, dl, MVT::i32));
} else if (V2->isUndef() && isSingletonEXTMask(ShuffleMask, VT, Imm)) {
Imm *= getExtFactor(V1);
return DAG.getNode(AArch64ISD::EXT, dl, V1.getValueType(), V1, V1,
DAG.getConstant(Imm, dl, MVT::i32));
}
unsigned WhichResult;
if (isZIPMask(ShuffleMask, VT, WhichResult)) {
unsigned Opc = (WhichResult == 0) ? AArch64ISD::ZIP1 : AArch64ISD::ZIP2;
return DAG.getNode(Opc, dl, V1.getValueType(), V1, V2);
}
if (isUZPMask(ShuffleMask, VT, WhichResult)) {
unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
return DAG.getNode(Opc, dl, V1.getValueType(), V1, V2);
}
if (isTRNMask(ShuffleMask, VT, WhichResult)) {
unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
return DAG.getNode(Opc, dl, V1.getValueType(), V1, V2);
}
if (isZIP_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
unsigned Opc = (WhichResult == 0) ? AArch64ISD::ZIP1 : AArch64ISD::ZIP2;
return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1);
}
if (isUZP_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1);
}
if (isTRN_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1);
}
if (SDValue Concat = tryFormConcatFromShuffle(Op, DAG))
return Concat;
bool DstIsLeft;
int Anomaly;
int NumInputElements = V1.getValueType().getVectorNumElements();
if (isINSMask(ShuffleMask, NumInputElements, DstIsLeft, Anomaly)) {
SDValue DstVec = DstIsLeft ? V1 : V2;
SDValue DstLaneV = DAG.getConstant(Anomaly, dl, MVT::i64);
SDValue SrcVec = V1;
int SrcLane = ShuffleMask[Anomaly];
if (SrcLane >= NumInputElements) {
SrcVec = V2;
SrcLane -= VT.getVectorNumElements();
}
SDValue SrcLaneV = DAG.getConstant(SrcLane, dl, MVT::i64);
EVT ScalarVT = VT.getVectorElementType();
if (ScalarVT.getSizeInBits() < 32 && ScalarVT.isInteger())
ScalarVT = MVT::i32;
return DAG.getNode(
ISD::INSERT_VECTOR_ELT, dl, VT, DstVec,
DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ScalarVT, SrcVec, SrcLaneV),
DstLaneV);
}
// If the shuffle is not directly supported and it has 4 elements, use
// the PerfectShuffle-generated table to synthesize it from other shuffles.
unsigned NumElts = VT.getVectorNumElements();
if (NumElts == 4) {
unsigned PFIndexes[4];
for (unsigned i = 0; i != 4; ++i) {
if (ShuffleMask[i] < 0)
PFIndexes[i] = 8;
else
PFIndexes[i] = ShuffleMask[i];
}
// Compute the index in the perfect shuffle table.
unsigned PFTableIndex = PFIndexes[0] * 9 * 9 * 9 + PFIndexes[1] * 9 * 9 +
PFIndexes[2] * 9 + PFIndexes[3];
unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
unsigned Cost = (PFEntry >> 30);
if (Cost <= 4)
return GeneratePerfectShuffle(PFEntry, V1, V2, DAG, dl);
}
return GenerateTBL(Op, ShuffleMask, DAG);
}
static bool resolveBuildVector(BuildVectorSDNode *BVN, APInt &CnstBits,
APInt &UndefBits) {
EVT VT = BVN->getValueType(0);
APInt SplatBits, SplatUndef;
unsigned SplatBitSize;
bool HasAnyUndefs;
if (BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
unsigned NumSplats = VT.getSizeInBits() / SplatBitSize;
for (unsigned i = 0; i < NumSplats; ++i) {
CnstBits <<= SplatBitSize;
UndefBits <<= SplatBitSize;
CnstBits |= SplatBits.zextOrTrunc(VT.getSizeInBits());
UndefBits |= (SplatBits ^ SplatUndef).zextOrTrunc(VT.getSizeInBits());
}
return true;
}
return false;
}
SDValue AArch64TargetLowering::LowerVectorAND(SDValue Op,
SelectionDAG &DAG) const {
BuildVectorSDNode *BVN =
dyn_cast<BuildVectorSDNode>(Op.getOperand(1).getNode());
SDValue LHS = Op.getOperand(0);
SDLoc dl(Op);
EVT VT = Op.getValueType();
if (!BVN)
return Op;
APInt CnstBits(VT.getSizeInBits(), 0);
APInt UndefBits(VT.getSizeInBits(), 0);
if (resolveBuildVector(BVN, CnstBits, UndefBits)) {
// We only have BIC vector immediate instruction, which is and-not.
CnstBits = ~CnstBits;
// We make use of a little bit of goto ickiness in order to avoid having to
// duplicate the immediate matching logic for the undef toggled case.
bool SecondTry = false;
AttemptModImm:
if (CnstBits.getHiBits(64) == CnstBits.getLoBits(64)) {
CnstBits = CnstBits.zextOrTrunc(64);
uint64_t CnstVal = CnstBits.getZExtValue();
if (AArch64_AM::isAdvSIMDModImmType1(CnstVal)) {
CnstVal = AArch64_AM::encodeAdvSIMDModImmType1(CnstVal);
MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
SDValue Mov = DAG.getNode(AArch64ISD::BICi, dl, MovTy, LHS,
DAG.getConstant(CnstVal, dl, MVT::i32),
DAG.getConstant(0, dl, MVT::i32));
return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
}
if (AArch64_AM::isAdvSIMDModImmType2(CnstVal)) {
CnstVal = AArch64_AM::encodeAdvSIMDModImmType2(CnstVal);
MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
SDValue Mov = DAG.getNode(AArch64ISD::BICi, dl, MovTy, LHS,
DAG.getConstant(CnstVal, dl, MVT::i32),
DAG.getConstant(8, dl, MVT::i32));
return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
}
if (AArch64_AM::isAdvSIMDModImmType3(CnstVal)) {
CnstVal = AArch64_AM::encodeAdvSIMDModImmType3(CnstVal);
MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
SDValue Mov = DAG.getNode(AArch64ISD::BICi, dl, MovTy, LHS,
DAG.getConstant(CnstVal, dl, MVT::i32),
DAG.getConstant(16, dl, MVT::i32));
return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
}
if (AArch64_AM::isAdvSIMDModImmType4(CnstVal)) {
CnstVal = AArch64_AM::encodeAdvSIMDModImmType4(CnstVal);
MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
SDValue Mov = DAG.getNode(AArch64ISD::BICi, dl, MovTy, LHS,
DAG.getConstant(CnstVal, dl, MVT::i32),
DAG.getConstant(24, dl, MVT::i32));
return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
}
if (AArch64_AM::isAdvSIMDModImmType5(CnstVal)) {
CnstVal = AArch64_AM::encodeAdvSIMDModImmType5(CnstVal);
MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16;
SDValue Mov = DAG.getNode(AArch64ISD::BICi, dl, MovTy, LHS,
DAG.getConstant(CnstVal, dl, MVT::i32),
DAG.getConstant(0, dl, MVT::i32));
return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
}
if (AArch64_AM::isAdvSIMDModImmType6(CnstVal)) {
CnstVal = AArch64_AM::encodeAdvSIMDModImmType6(CnstVal);
MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16;
SDValue Mov = DAG.getNode(AArch64ISD::BICi, dl, MovTy, LHS,
DAG.getConstant(CnstVal, dl, MVT::i32),
DAG.getConstant(8, dl, MVT::i32));
return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
}
}
if (SecondTry)
goto FailedModImm;
SecondTry = true;
CnstBits = ~UndefBits;
goto AttemptModImm;
}
// We can always fall back to a non-immediate AND.
FailedModImm:
return Op;
}
// Specialized code to quickly find if PotentialBVec is a BuildVector that
// consists of only the same constant int value, returned in reference arg
// ConstVal
static bool isAllConstantBuildVector(const SDValue &PotentialBVec,
uint64_t &ConstVal) {
BuildVectorSDNode *Bvec = dyn_cast<BuildVectorSDNode>(PotentialBVec);
if (!Bvec)
return false;
ConstantSDNode *FirstElt = dyn_cast<ConstantSDNode>(Bvec->getOperand(0));
if (!FirstElt)
return false;
EVT VT = Bvec->getValueType(0);
unsigned NumElts = VT.getVectorNumElements();
for (unsigned i = 1; i < NumElts; ++i)
if (dyn_cast<ConstantSDNode>(Bvec->getOperand(i)) != FirstElt)
return false;
ConstVal = FirstElt->getZExtValue();
return true;
}
static unsigned getIntrinsicID(const SDNode *N) {
unsigned Opcode = N->getOpcode();
switch (Opcode) {
default:
return Intrinsic::not_intrinsic;
case ISD::INTRINSIC_WO_CHAIN: {
unsigned IID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
if (IID < Intrinsic::num_intrinsics)
return IID;
return Intrinsic::not_intrinsic;
}
}
}
// Attempt to form a vector S[LR]I from (or (and X, BvecC1), (lsl Y, C2)),
// to (SLI X, Y, C2), where X and Y have matching vector types, BvecC1 is a
// BUILD_VECTORs with constant element C1, C2 is a constant, and C1 == ~C2.
// Also, logical shift right -> sri, with the same structure.
static SDValue tryLowerToSLI(SDNode *N, SelectionDAG &DAG) {
EVT VT = N->getValueType(0);
if (!VT.isVector())
return SDValue();
SDLoc DL(N);
// Is the first op an AND?
const SDValue And = N->getOperand(0);
if (And.getOpcode() != ISD::AND)
return SDValue();
// Is the second op an shl or lshr?
SDValue Shift = N->getOperand(1);
// This will have been turned into: AArch64ISD::VSHL vector, #shift
// or AArch64ISD::VLSHR vector, #shift
unsigned ShiftOpc = Shift.getOpcode();
if ((ShiftOpc != AArch64ISD::VSHL && ShiftOpc != AArch64ISD::VLSHR))
return SDValue();
bool IsShiftRight = ShiftOpc == AArch64ISD::VLSHR;
// Is the shift amount constant?
ConstantSDNode *C2node = dyn_cast<ConstantSDNode>(Shift.getOperand(1));
if (!C2node)
return SDValue();
// Is the and mask vector all constant?
uint64_t C1;
if (!isAllConstantBuildVector(And.getOperand(1), C1))
return SDValue();
// Is C1 == ~C2, taking into account how much one can shift elements of a
// particular size?
uint64_t C2 = C2node->getZExtValue();
unsigned ElemSizeInBits = VT.getScalarSizeInBits();
if (C2 > ElemSizeInBits)
return SDValue();
unsigned ElemMask = (1 << ElemSizeInBits) - 1;
if ((C1 & ElemMask) != (~C2 & ElemMask))
return SDValue();
SDValue X = And.getOperand(0);
SDValue Y = Shift.getOperand(0);
unsigned Intrin =
IsShiftRight ? Intrinsic::aarch64_neon_vsri : Intrinsic::aarch64_neon_vsli;
SDValue ResultSLI =
DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
DAG.getConstant(Intrin, DL, MVT::i32), X, Y,
Shift.getOperand(1));
DEBUG(dbgs() << "aarch64-lower: transformed: \n");
DEBUG(N->dump(&DAG));
DEBUG(dbgs() << "into: \n");
DEBUG(ResultSLI->dump(&DAG));
++NumShiftInserts;
return ResultSLI;
}
SDValue AArch64TargetLowering::LowerVectorOR(SDValue Op,
SelectionDAG &DAG) const {
// Attempt to form a vector S[LR]I from (or (and X, C1), (lsl Y, C2))
if (EnableAArch64SlrGeneration) {
if (SDValue Res = tryLowerToSLI(Op.getNode(), DAG))
return Res;
}
BuildVectorSDNode *BVN =
dyn_cast<BuildVectorSDNode>(Op.getOperand(0).getNode());
SDValue LHS = Op.getOperand(1);
SDLoc dl(Op);
EVT VT = Op.getValueType();
// OR commutes, so try swapping the operands.
if (!BVN) {
LHS = Op.getOperand(0);
BVN = dyn_cast<BuildVectorSDNode>(Op.getOperand(1).getNode());
}
if (!BVN)
return Op;
APInt CnstBits(VT.getSizeInBits(), 0);
APInt UndefBits(VT.getSizeInBits(), 0);
if (resolveBuildVector(BVN, CnstBits, UndefBits)) {
// We make use of a little bit of goto ickiness in order to avoid having to
// duplicate the immediate matching logic for the undef toggled case.
bool SecondTry = false;
AttemptModImm:
if (CnstBits.getHiBits(64) == CnstBits.getLoBits(64)) {
CnstBits = CnstBits.zextOrTrunc(64);
uint64_t CnstVal = CnstBits.getZExtValue();
if (AArch64_AM::isAdvSIMDModImmType1(CnstVal)) {
CnstVal = AArch64_AM::encodeAdvSIMDModImmType1(CnstVal);
MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
SDValue Mov = DAG.getNode(AArch64ISD::ORRi, dl, MovTy, LHS,
DAG.getConstant(CnstVal, dl, MVT::i32),
DAG.getConstant(0, dl, MVT::i32));
return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
}
if (AArch64_AM::isAdvSIMDModImmType2(CnstVal)) {
CnstVal = AArch64_AM::encodeAdvSIMDModImmType2(CnstVal);
MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
SDValue Mov = DAG.getNode(AArch64ISD::ORRi, dl, MovTy, LHS,
DAG.getConstant(CnstVal, dl, MVT::i32),
DAG.getConstant(8, dl, MVT::i32));
return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
}
if (AArch64_AM::isAdvSIMDModImmType3(CnstVal)) {
CnstVal = AArch64_AM::encodeAdvSIMDModImmType3(CnstVal);
MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
SDValue Mov = DAG.getNode(AArch64ISD::ORRi, dl, MovTy, LHS,
DAG.getConstant(CnstVal, dl, MVT::i32),
DAG.getConstant(16, dl, MVT::i32));
return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
}
if (AArch64_AM::isAdvSIMDModImmType4(CnstVal)) {
CnstVal = AArch64_AM::encodeAdvSIMDModImmType4(CnstVal);
MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
SDValue Mov = DAG.getNode(AArch64ISD::ORRi, dl, MovTy, LHS,
DAG.getConstant(CnstVal, dl, MVT::i32),
DAG.getConstant(24, dl, MVT::i32));
return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
}
if (AArch64_AM::isAdvSIMDModImmType5(CnstVal)) {
CnstVal = AArch64_AM::encodeAdvSIMDModImmType5(CnstVal);
MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16;
SDValue Mov = DAG.getNode(AArch64ISD::ORRi, dl, MovTy, LHS,
DAG.getConstant(CnstVal, dl, MVT::i32),
DAG.getConstant(0, dl, MVT::i32));
return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
}
if (AArch64_AM::isAdvSIMDModImmType6(CnstVal)) {
CnstVal = AArch64_AM::encodeAdvSIMDModImmType6(CnstVal);
MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16;
SDValue Mov = DAG.getNode(AArch64ISD::ORRi, dl, MovTy, LHS,
DAG.getConstant(CnstVal, dl, MVT::i32),
DAG.getConstant(8, dl, MVT::i32));
return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
}
}
if (SecondTry)
goto FailedModImm;
SecondTry = true;
CnstBits = UndefBits;
goto AttemptModImm;
}
// We can always fall back to a non-immediate OR.
FailedModImm:
return Op;
}
// Normalize the operands of BUILD_VECTOR. The value of constant operands will
// be truncated to fit element width.
static SDValue NormalizeBuildVector(SDValue Op,
SelectionDAG &DAG) {
assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
SDLoc dl(Op);
EVT VT = Op.getValueType();
EVT EltTy= VT.getVectorElementType();
if (EltTy.isFloatingPoint() || EltTy.getSizeInBits() > 16)
return Op;
SmallVector<SDValue, 16> Ops;
for (SDValue Lane : Op->ops()) {
if (auto *CstLane = dyn_cast<ConstantSDNode>(Lane)) {
APInt LowBits(EltTy.getSizeInBits(),
CstLane->getZExtValue());
Lane = DAG.getConstant(LowBits.getZExtValue(), dl, MVT::i32);
}
Ops.push_back(Lane);
}
return DAG.getBuildVector(VT, dl, Ops);
}
SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op,
SelectionDAG &DAG) const {
SDLoc dl(Op);
EVT VT = Op.getValueType();
Op = NormalizeBuildVector(Op, DAG);
BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode());
APInt CnstBits(VT.getSizeInBits(), 0);
APInt UndefBits(VT.getSizeInBits(), 0);
if (resolveBuildVector(BVN, CnstBits, UndefBits)) {
// We make use of a little bit of goto ickiness in order to avoid having to
// duplicate the immediate matching logic for the undef toggled case.
bool SecondTry = false;
AttemptModImm:
if (CnstBits.getHiBits(64) == CnstBits.getLoBits(64)) {
CnstBits = CnstBits.zextOrTrunc(64);
uint64_t CnstVal = CnstBits.getZExtValue();
// Certain magic vector constants (used to express things like NOT
// and NEG) are passed through unmodified. This allows codegen patterns
// for these operations to match. Special-purpose patterns will lower
// these immediates to MOVIs if it proves necessary.
if (VT.isInteger() && (CnstVal == 0 || CnstVal == ~0ULL))
return Op;
// The many faces of MOVI...
if (AArch64_AM::isAdvSIMDModImmType10(CnstVal)) {
CnstVal = AArch64_AM::encodeAdvSIMDModImmType10(CnstVal);
if (VT.getSizeInBits() == 128) {
SDValue Mov = DAG.getNode(AArch64ISD::MOVIedit, dl, MVT::v2i64,
DAG.getConstant(CnstVal, dl, MVT::i32));
return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
}
// Support the V64 version via subregister insertion.
SDValue Mov = DAG.getNode(AArch64ISD::MOVIedit, dl, MVT::f64,
DAG.getConstant(CnstVal, dl, MVT::i32));
return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
}
if (AArch64_AM::isAdvSIMDModImmType1(CnstVal)) {
CnstVal = AArch64_AM::encodeAdvSIMDModImmType1(CnstVal);
MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
SDValue Mov = DAG.getNode(AArch64ISD::MOVIshift, dl, MovTy,
DAG.getConstant(CnstVal, dl, MVT::i32),
DAG.getConstant(0, dl, MVT::i32));
return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
}
if (AArch64_AM::isAdvSIMDModImmType2(CnstVal)) {
CnstVal = AArch64_AM::encodeAdvSIMDModImmType2(CnstVal);
MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
SDValue Mov = DAG.getNode(AArch64ISD::MOVIshift, dl, MovTy,
DAG.getConstant(CnstVal, dl, MVT::i32),
DAG.getConstant(8, dl, MVT::i32));
return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
}
if (AArch64_AM::isAdvSIMDModImmType3(CnstVal)) {
CnstVal = AArch64_AM::encodeAdvSIMDModImmType3(CnstVal);
MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
SDValue Mov = DAG.getNode(AArch64ISD::MOVIshift, dl, MovTy,
DAG.getConstant(CnstVal, dl, MVT::i32),
DAG.getConstant(16, dl, MVT::i32));
return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
}
if (AArch64_AM::isAdvSIMDModImmType4(CnstVal)) {
CnstVal = AArch64_AM::encodeAdvSIMDModImmType4(CnstVal);
MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
SDValue Mov = DAG.getNode(AArch64ISD::MOVIshift, dl, MovTy,
DAG.getConstant(CnstVal, dl, MVT::i32),
DAG.getConstant(24, dl, MVT::i32));
return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
}
if (AArch64_AM::isAdvSIMDModImmType5(CnstVal)) {
CnstVal = AArch64_AM::encodeAdvSIMDModImmType5(CnstVal);
MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16;
SDValue Mov = DAG.getNode(AArch64ISD::MOVIshift, dl, MovTy,
DAG.getConstant(CnstVal, dl, MVT::i32),
DAG.getConstant(0, dl, MVT::i32));
return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
}
if (AArch64_AM::isAdvSIMDModImmType6(CnstVal)) {
CnstVal = AArch64_AM::encodeAdvSIMDModImmType6(CnstVal);
MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16;
SDValue Mov = DAG.getNode(AArch64ISD::MOVIshift, dl, MovTy,
DAG.getConstant(CnstVal, dl, MVT::i32),
DAG.getConstant(8, dl, MVT::i32));
return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
}
if (AArch64_AM::isAdvSIMDModImmType7(CnstVal)) {
CnstVal = AArch64_AM::encodeAdvSIMDModImmType7(CnstVal);
MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
SDValue Mov = DAG.getNode(AArch64ISD::MOVImsl, dl, MovTy,
DAG.getConstant(CnstVal, dl, MVT::i32),
DAG.getConstant(264, dl, MVT::i32));
return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
}
if (AArch64_AM::isAdvSIMDModImmType8(CnstVal)) {
CnstVal = AArch64_AM::encodeAdvSIMDModImmType8(CnstVal);
MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
SDValue Mov = DAG.getNode(AArch64ISD::MOVImsl, dl, MovTy,
DAG.getConstant(CnstVal, dl, MVT::i32),
DAG.getConstant(272, dl, MVT::i32));
return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
}
if (AArch64_AM::isAdvSIMDModImmType9(CnstVal)) {
CnstVal = AArch64_AM::encodeAdvSIMDModImmType9(CnstVal);
MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v16i8 : MVT::v8i8;
SDValue Mov = DAG.getNode(AArch64ISD::MOVI, dl, MovTy,
DAG.getConstant(CnstVal, dl, MVT::i32));
return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
}
// The few faces of FMOV...
if (AArch64_AM::isAdvSIMDModImmType11(CnstVal)) {
CnstVal = AArch64_AM::encodeAdvSIMDModImmType11(CnstVal);
MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4f32 : MVT::v2f32;
SDValue Mov = DAG.getNode(AArch64ISD::FMOV, dl, MovTy,
DAG.getConstant(CnstVal, dl, MVT::i32));
return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
}
if (AArch64_AM::isAdvSIMDModImmType12(CnstVal) &&
VT.getSizeInBits() == 128) {
CnstVal = AArch64_AM::encodeAdvSIMDModImmType12(CnstVal);
SDValue Mov = DAG.getNode(AArch64ISD::FMOV, dl, MVT::v2f64,
DAG.getConstant(CnstVal, dl, MVT::i32));
return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
}
// The many faces of MVNI...
CnstVal = ~CnstVal;
if (AArch64_AM::isAdvSIMDModImmType1(CnstVal)) {
CnstVal = AArch64_AM::encodeAdvSIMDModImmType1(CnstVal);
MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
SDValue Mov = DAG.getNode(AArch64ISD::MVNIshift, dl, MovTy,
DAG.getConstant(CnstVal, dl, MVT::i32),
DAG.getConstant(0, dl, MVT::i32));
return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
}
if (AArch64_AM::isAdvSIMDModImmType2(CnstVal)) {
CnstVal = AArch64_AM::encodeAdvSIMDModImmType2(CnstVal);
MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
SDValue Mov = DAG.getNode(AArch64ISD::MVNIshift, dl, MovTy,
DAG.getConstant(CnstVal, dl, MVT::i32),
DAG.getConstant(8, dl, MVT::i32));
return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
}
if (AArch64_AM::isAdvSIMDModImmType3(CnstVal)) {
CnstVal = AArch64_AM::encodeAdvSIMDModImmType3(CnstVal);
MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
SDValue Mov = DAG.getNode(AArch64ISD::MVNIshift, dl, MovTy,
DAG.getConstant(CnstVal, dl, MVT::i32),
DAG.getConstant(16, dl, MVT::i32));
return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
}
if (AArch64_AM::isAdvSIMDModImmType4(CnstVal)) {
CnstVal = AArch64_AM::encodeAdvSIMDModImmType4(CnstVal);
MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
SDValue Mov = DAG.getNode(AArch64ISD::MVNIshift, dl, MovTy,
DAG.getConstant(CnstVal, dl, MVT::i32),
DAG.getConstant(24, dl, MVT::i32));
return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
}
if (AArch64_AM::isAdvSIMDModImmType5(CnstVal)) {
CnstVal = AArch64_AM::encodeAdvSIMDModImmType5(CnstVal);
MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16;
SDValue Mov = DAG.getNode(AArch64ISD::MVNIshift, dl, MovTy,
DAG.getConstant(CnstVal, dl, MVT::i32),
DAG.getConstant(0, dl, MVT::i32));
return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
}
if (AArch64_AM::isAdvSIMDModImmType6(CnstVal)) {
CnstVal = AArch64_AM::encodeAdvSIMDModImmType6(CnstVal);
MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16;
SDValue Mov = DAG.getNode(AArch64ISD::MVNIshift, dl, MovTy,
DAG.getConstant(CnstVal, dl, MVT::i32),
DAG.getConstant(8, dl, MVT::i32));
return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
}
if (AArch64_AM::isAdvSIMDModImmType7(CnstVal)) {
CnstVal = AArch64_AM::encodeAdvSIMDModImmType7(CnstVal);
MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
SDValue Mov = DAG.getNode(AArch64ISD::MVNImsl, dl, MovTy,
DAG.getConstant(CnstVal, dl, MVT::i32),
DAG.getConstant(264, dl, MVT::i32));
return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
}
if (AArch64_AM::isAdvSIMDModImmType8(CnstVal)) {
CnstVal = AArch64_AM::encodeAdvSIMDModImmType8(CnstVal);
MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
SDValue Mov = DAG.getNode(AArch64ISD::MVNImsl, dl, MovTy,
DAG.getConstant(CnstVal, dl, MVT::i32),
DAG.getConstant(272, dl, MVT::i32));
return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
}
}
if (SecondTry)
goto FailedModImm;
SecondTry = true;
CnstBits = UndefBits;
goto AttemptModImm;
}
FailedModImm:
// Scan through the operands to find some interesting properties we can
// exploit:
// 1) If only one value is used, we can use a DUP, or
// 2) if only the low element is not undef, we can just insert that, or
// 3) if only one constant value is used (w/ some non-constant lanes),
// we can splat the constant value into the whole vector then fill
// in the non-constant lanes.
// 4) FIXME: If different constant values are used, but we can intelligently
// select the values we'll be overwriting for the non-constant
// lanes such that we can directly materialize the vector
// some other way (MOVI, e.g.), we can be sneaky.
unsigned NumElts = VT.getVectorNumElements();
bool isOnlyLowElement = true;
bool usesOnlyOneValue = true;
bool usesOnlyOneConstantValue = true;
bool isConstant = true;
unsigned NumConstantLanes = 0;
SDValue Value;
SDValue ConstantValue;
for (unsigned i = 0; i < NumElts; ++i) {
SDValue V = Op.getOperand(i);
if (V.isUndef())
continue;
if (i > 0)
isOnlyLowElement = false;
if (!isa<ConstantFPSDNode>(V) && !isa<ConstantSDNode>(V))
isConstant = false;
if (isa<ConstantSDNode>(V) || isa<ConstantFPSDNode>(V)) {
++NumConstantLanes;
if (!ConstantValue.getNode())
ConstantValue = V;
else if (ConstantValue != V)
usesOnlyOneConstantValue = false;
}
if (!Value.getNode())
Value = V;
else if (V != Value)
usesOnlyOneValue = false;
}
if (!Value.getNode())
return DAG.getUNDEF(VT);
if (isOnlyLowElement)
return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Value);
// Use DUP for non-constant splats. For f32 constant splats, reduce to
// i32 and try again.
if (usesOnlyOneValue) {
if (!isConstant) {
if (Value.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
Value.getValueType() != VT)
return DAG.getNode(AArch64ISD::DUP, dl, VT, Value);
// This is actually a DUPLANExx operation, which keeps everything vectory.
// DUPLANE works on 128-bit vectors, widen it if necessary.
SDValue Lane = Value.getOperand(1);
Value = Value.getOperand(0);
if (Value.getValueSizeInBits() == 64)
Value = WidenVector(Value, DAG);
unsigned Opcode = getDUPLANEOp(VT.getVectorElementType());
return DAG.getNode(Opcode, dl, VT, Value, Lane);
}
if (VT.getVectorElementType().isFloatingPoint()) {
SmallVector<SDValue, 8> Ops;
EVT EltTy = VT.getVectorElementType();
assert ((EltTy == MVT::f16 || EltTy == MVT::f32 || EltTy == MVT::f64) &&
"Unsupported floating-point vector type");
MVT NewType = MVT::getIntegerVT(EltTy.getSizeInBits());
for (unsigned i = 0; i < NumElts; ++i)
Ops.push_back(DAG.getNode(ISD::BITCAST, dl, NewType, Op.getOperand(i)));
EVT VecVT = EVT::getVectorVT(*DAG.getContext(), NewType, NumElts);
SDValue Val = DAG.getBuildVector(VecVT, dl, Ops);
Val = LowerBUILD_VECTOR(Val, DAG);
if (Val.getNode())
return DAG.getNode(ISD::BITCAST, dl, VT, Val);
}
}
// If there was only one constant value used and for more than one lane,
// start by splatting that value, then replace the non-constant lanes. This
// is better than the default, which will perform a separate initialization
// for each lane.
if (NumConstantLanes > 0 && usesOnlyOneConstantValue) {
SDValue Val = DAG.getNode(AArch64ISD::DUP, dl, VT, ConstantValue);
// Now insert the non-constant lanes.
for (unsigned i = 0; i < NumElts; ++i) {
SDValue V = Op.getOperand(i);
SDValue LaneIdx = DAG.getConstant(i, dl, MVT::i64);
if (!isa<ConstantSDNode>(V) && !isa<ConstantFPSDNode>(V)) {
// Note that type legalization likely mucked about with the VT of the
// source operand, so we may have to convert it here before inserting.
Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Val, V, LaneIdx);
}
}
return Val;
}
// If all elements are constants and the case above didn't get hit, fall back
// to the default expansion, which will generate a load from the constant
// pool.
if (isConstant)
return SDValue();
// Empirical tests suggest this is rarely worth it for vectors of length <= 2.
if (NumElts >= 4) {
if (SDValue shuffle = ReconstructShuffle(Op, DAG))
return shuffle;
}
// If all else fails, just use a sequence of INSERT_VECTOR_ELT when we
// know the default expansion would otherwise fall back on something even
// worse. For a vector with one or two non-undef values, that's
// scalar_to_vector for the elements followed by a shuffle (provided the
// shuffle is valid for the target) and materialization element by element
// on the stack followed by a load for everything else.
if (!isConstant && !usesOnlyOneValue) {
SDValue Vec = DAG.getUNDEF(VT);
SDValue Op0 = Op.getOperand(0);
unsigned ElemSize = VT.getScalarSizeInBits();
unsigned i = 0;
// For 32 and 64 bit types, use INSERT_SUBREG for lane zero to
// a) Avoid a RMW dependency on the full vector register, and
// b) Allow the register coalescer to fold away the copy if the
// value is already in an S or D register.
// Do not do this for UNDEF/LOAD nodes because we have better patterns
// for those avoiding the SCALAR_TO_VECTOR/BUILD_VECTOR.
if (!Op0.isUndef() && Op0.getOpcode() != ISD::LOAD &&
(ElemSize == 32 || ElemSize == 64)) {
unsigned SubIdx = ElemSize == 32 ? AArch64::ssub : AArch64::dsub;
MachineSDNode *N =
DAG.getMachineNode(TargetOpcode::INSERT_SUBREG, dl, VT, Vec, Op0,
DAG.getTargetConstant(SubIdx, dl, MVT::i32));
Vec = SDValue(N, 0);
++i;
}
for (; i < NumElts; ++i) {
SDValue V = Op.getOperand(i);
if (V.isUndef())
continue;
SDValue LaneIdx = DAG.getConstant(i, dl, MVT::i64);
Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Vec, V, LaneIdx);
}
return Vec;
}
// Just use the default expansion. We failed to find a better alternative.
return SDValue();
}
SDValue AArch64TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
SelectionDAG &DAG) const {
assert(Op.getOpcode() == ISD::INSERT_VECTOR_ELT && "Unknown opcode!");
// Check for non-constant or out of range lane.
EVT VT = Op.getOperand(0).getValueType();
ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Op.getOperand(2));
if (!CI || CI->getZExtValue() >= VT.getVectorNumElements())
return SDValue();
// Insertion/extraction are legal for V128 types.
if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32 ||
VT == MVT::v2i64 || VT == MVT::v4f32 || VT == MVT::v2f64 ||
VT == MVT::v8f16)
return Op;
if (VT != MVT::v8i8 && VT != MVT::v4i16 && VT != MVT::v2i32 &&
VT != MVT::v1i64 && VT != MVT::v2f32 && VT != MVT::v4f16)
return SDValue();
// For V64 types, we perform insertion by expanding the value
// to a V128 type and perform the insertion on that.
SDLoc DL(Op);
SDValue WideVec = WidenVector(Op.getOperand(0), DAG);
EVT WideTy = WideVec.getValueType();
SDValue Node = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, WideTy, WideVec,
Op.getOperand(1), Op.getOperand(2));
// Re-narrow the resultant vector.
return NarrowVector(Node, DAG);
}
SDValue
AArch64TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
SelectionDAG &DAG) const {
assert(Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Unknown opcode!");
// Check for non-constant or out of range lane.
EVT VT = Op.getOperand(0).getValueType();
ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Op.getOperand(1));
if (!CI || CI->getZExtValue() >= VT.getVectorNumElements())
return SDValue();
// Insertion/extraction are legal for V128 types.
if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32 ||
VT == MVT::v2i64 || VT == MVT::v4f32 || VT == MVT::v2f64 ||
VT == MVT::v8f16)
return Op;
if (VT != MVT::v8i8 && VT != MVT::v4i16 && VT != MVT::v2i32 &&
VT != MVT::v1i64 && VT != MVT::v2f32 && VT != MVT::v4f16)
return SDValue();
// For V64 types, we perform extraction by expanding the value
// to a V128 type and perform the extraction on that.
SDLoc DL(Op);
SDValue WideVec = WidenVector(Op.getOperand(0), DAG);
EVT WideTy = WideVec.getValueType();
EVT ExtrTy = WideTy.getVectorElementType();
if (ExtrTy == MVT::i16 || ExtrTy == MVT::i8)
ExtrTy = MVT::i32;
// For extractions, we just return the result directly.
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtrTy, WideVec,
Op.getOperand(1));
}
SDValue AArch64TargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op,
SelectionDAG &DAG) const {
EVT VT = Op.getOperand(0).getValueType();
SDLoc dl(Op);
// Just in case...
if (!VT.isVector())
return SDValue();
ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op.getOperand(1));
if (!Cst)
return SDValue();
unsigned Val = Cst->getZExtValue();
unsigned Size = Op.getValueSizeInBits();
// This will get lowered to an appropriate EXTRACT_SUBREG in ISel.
if (Val == 0)
return Op;
// If this is extracting the upper 64-bits of a 128-bit vector, we match
// that directly.
if (Size == 64 && Val * VT.getScalarSizeInBits() == 64)
return Op;
return SDValue();
}
bool AArch64TargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M,
EVT VT) const {
if (VT.getVectorNumElements() == 4 &&
(VT.is128BitVector() || VT.is64BitVector())) {
unsigned PFIndexes[4];
for (unsigned i = 0; i != 4; ++i) {
if (M[i] < 0)
PFIndexes[i] = 8;
else
PFIndexes[i] = M[i];
}
// Compute the index in the perfect shuffle table.
unsigned PFTableIndex = PFIndexes[0] * 9 * 9 * 9 + PFIndexes[1] * 9 * 9 +
PFIndexes[2] * 9 + PFIndexes[3];
unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
unsigned Cost = (PFEntry >> 30);
if (Cost <= 4)
return true;
}
bool DummyBool;
int DummyInt;
unsigned DummyUnsigned;
return (ShuffleVectorSDNode::isSplatMask(&M[0], VT) || isREVMask(M, VT, 64) ||
isREVMask(M, VT, 32) || isREVMask(M, VT, 16) ||
isEXTMask(M, VT, DummyBool, DummyUnsigned) ||
// isTBLMask(M, VT) || // FIXME: Port TBL support from ARM.
isTRNMask(M, VT, DummyUnsigned) || isUZPMask(M, VT, DummyUnsigned) ||
isZIPMask(M, VT, DummyUnsigned) ||
isTRN_v_undef_Mask(M, VT, DummyUnsigned) ||
isUZP_v_undef_Mask(M, VT, DummyUnsigned) ||
isZIP_v_undef_Mask(M, VT, DummyUnsigned) ||
isINSMask(M, VT.getVectorNumElements(), DummyBool, DummyInt) ||
isConcatMask(M, VT, VT.getSizeInBits() == 128));
}
/// getVShiftImm - Check if this is a valid build_vector for the immediate
/// operand of a vector shift operation, where all the elements of the
/// build_vector must have the same constant integer value.
static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt) {
// Ignore bit_converts.
while (Op.getOpcode() == ISD::BITCAST)
Op = Op.getOperand(0);
BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(Op.getNode());
APInt SplatBits, SplatUndef;
unsigned SplatBitSize;
bool HasAnyUndefs;
if (!BVN || !BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize,
HasAnyUndefs, ElementBits) ||
SplatBitSize > ElementBits)
return false;
Cnt = SplatBits.getSExtValue();
return true;
}
/// isVShiftLImm - Check if this is a valid build_vector for the immediate
/// operand of a vector shift left operation. That value must be in the range:
/// 0 <= Value < ElementBits for a left shift; or
/// 0 <= Value <= ElementBits for a long left shift.
static bool isVShiftLImm(SDValue Op, EVT VT, bool isLong, int64_t &Cnt) {
assert(VT.isVector() && "vector shift count is not a vector type");
int64_t ElementBits = VT.getScalarSizeInBits();
if (!getVShiftImm(Op, ElementBits, Cnt))
return false;
return (Cnt >= 0 && (isLong ? Cnt - 1 : Cnt) < ElementBits);
}
/// isVShiftRImm - Check if this is a valid build_vector for the immediate
/// operand of a vector shift right operation. The value must be in the range:
/// 1 <= Value <= ElementBits for a right shift; or
static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, int64_t &Cnt) {
assert(VT.isVector() && "vector shift count is not a vector type");
int64_t ElementBits = VT.getScalarSizeInBits();
if (!getVShiftImm(Op, ElementBits, Cnt))
return false;
return (Cnt >= 1 && Cnt <= (isNarrow ? ElementBits / 2 : ElementBits));
}
SDValue AArch64TargetLowering::LowerVectorSRA_SRL_SHL(SDValue Op,
SelectionDAG &DAG) const {
EVT VT = Op.getValueType();
SDLoc DL(Op);
int64_t Cnt;
if (!Op.getOperand(1).getValueType().isVector())
return Op;
unsigned EltSize = VT.getScalarSizeInBits();
switch (Op.getOpcode()) {
default:
llvm_unreachable("unexpected shift opcode");
case ISD::SHL:
if (isVShiftLImm(Op.getOperand(1), VT, false, Cnt) && Cnt < EltSize)
return DAG.getNode(AArch64ISD::VSHL, DL, VT, Op.getOperand(0),
DAG.getConstant(Cnt, DL, MVT::i32));
return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
DAG.getConstant(Intrinsic::aarch64_neon_ushl, DL,
MVT::i32),
Op.getOperand(0), Op.getOperand(1));
case ISD::SRA:
case ISD::SRL:
// Right shift immediate
if (isVShiftRImm(Op.getOperand(1), VT, false, Cnt) && Cnt < EltSize) {
unsigned Opc =
(Op.getOpcode() == ISD::SRA) ? AArch64ISD::VASHR : AArch64ISD::VLSHR;
return DAG.getNode(Opc, DL, VT, Op.getOperand(0),
DAG.getConstant(Cnt, DL, MVT::i32));
}
// Right shift register. Note, there is not a shift right register
// instruction, but the shift left register instruction takes a signed
// value, where negative numbers specify a right shift.
unsigned Opc = (Op.getOpcode() == ISD::SRA) ? Intrinsic::aarch64_neon_sshl
: Intrinsic::aarch64_neon_ushl;
// negate the shift amount
SDValue NegShift = DAG.getNode(AArch64ISD::NEG, DL, VT, Op.getOperand(1));
SDValue NegShiftLeft =
DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
DAG.getConstant(Opc, DL, MVT::i32), Op.getOperand(0),
NegShift);
return NegShiftLeft;
}
return SDValue();
}
static SDValue EmitVectorComparison(SDValue LHS, SDValue RHS,
AArch64CC::CondCode CC, bool NoNans, EVT VT,
const SDLoc &dl, SelectionDAG &DAG) {
EVT SrcVT = LHS.getValueType();
assert(VT.getSizeInBits() == SrcVT.getSizeInBits() &&
"function only supposed to emit natural comparisons");
BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(RHS.getNode());
APInt CnstBits(VT.getSizeInBits(), 0);
APInt UndefBits(VT.getSizeInBits(), 0);
bool IsCnst = BVN && resolveBuildVector(BVN, CnstBits, UndefBits);
bool IsZero = IsCnst && (CnstBits == 0);
if (SrcVT.getVectorElementType().isFloatingPoint()) {
switch (CC) {
default:
return SDValue();
case AArch64CC::NE: {
SDValue Fcmeq;
if (IsZero)
Fcmeq = DAG.getNode(AArch64ISD::FCMEQz, dl, VT, LHS);
else
Fcmeq = DAG.getNode(AArch64ISD::FCMEQ, dl, VT, LHS, RHS);
return DAG.getNode(AArch64ISD::NOT, dl, VT, Fcmeq);
}
case AArch64CC::EQ:
if (IsZero)
return DAG.getNode(AArch64ISD::FCMEQz, dl, VT, LHS);
return DAG.getNode(AArch64ISD::FCMEQ, dl, VT, LHS, RHS);
case AArch64CC::GE:
if (IsZero)
return DAG.getNode(AArch64ISD::FCMGEz, dl, VT, LHS);
return DAG.getNode(AArch64ISD::FCMGE, dl, VT, LHS, RHS);
case AArch64CC::GT:
if (IsZero)
return DAG.getNode(AArch64ISD::FCMGTz, dl, VT, LHS);
return DAG.getNode(AArch64ISD::FCMGT, dl, VT, LHS, RHS);
case AArch64CC::LS:
if (IsZero)
return DAG.getNode(AArch64ISD::FCMLEz, dl, VT, LHS);
return DAG.getNode(AArch64ISD::FCMGE, dl, VT, RHS, LHS);
case AArch64CC::LT:
if (!NoNans)
return SDValue();
// If we ignore NaNs then we can use to the MI implementation.
LLVM_FALLTHROUGH;
case AArch64CC::MI:
if (IsZero)
return DAG.getNode(AArch64ISD::FCMLTz, dl, VT, LHS);
return DAG.getNode(AArch64ISD::FCMGT, dl, VT, RHS, LHS);
}
}
switch (CC) {
default:
return SDValue();
case AArch64CC::NE: {
SDValue Cmeq;
if (IsZero)
Cmeq = DAG.getNode(AArch64ISD::CMEQz, dl, VT, LHS);
else
Cmeq = DAG.getNode(AArch64ISD::CMEQ, dl, VT, LHS, RHS);
return DAG.getNode(AArch64ISD::NOT, dl, VT, Cmeq);
}
case AArch64CC::EQ:
if (IsZero)
return DAG.getNode(AArch64ISD::CMEQz, dl, VT, LHS);
return DAG.getNode(AArch64ISD::CMEQ, dl, VT, LHS, RHS);
case AArch64CC::GE:
if (IsZero)
return DAG.getNode(AArch64ISD::CMGEz, dl, VT, LHS);
return DAG.getNode(AArch64ISD::CMGE, dl, VT, LHS, RHS);
case AArch64CC::GT:
if (IsZero)
return DAG.getNode(AArch64ISD::CMGTz, dl, VT, LHS);
return DAG.getNode(AArch64ISD::CMGT, dl, VT, LHS, RHS);
case AArch64CC::LE:
if (IsZero)
return DAG.getNode(AArch64ISD::CMLEz, dl, VT, LHS);
return DAG.getNode(AArch64ISD::CMGE, dl, VT, RHS, LHS);
case AArch64CC::LS:
return DAG.getNode(AArch64ISD::CMHS, dl, VT, RHS, LHS);
case AArch64CC::LO:
return DAG.getNode(AArch64ISD::CMHI, dl, VT, RHS, LHS);
case AArch64CC::LT:
if (IsZero)
return DAG.getNode(AArch64ISD::CMLTz, dl, VT, LHS);
return DAG.getNode(AArch64ISD::CMGT, dl, VT, RHS, LHS);
case AArch64CC::HI:
return DAG.getNode(AArch64ISD::CMHI, dl, VT, LHS, RHS);
case AArch64CC::HS:
return DAG.getNode(AArch64ISD::CMHS, dl, VT, LHS, RHS);
}
}
SDValue AArch64TargetLowering::LowerVSETCC(SDValue Op,
SelectionDAG &DAG) const {
ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
SDValue LHS = Op.getOperand(0);
SDValue RHS = Op.getOperand(1);
EVT CmpVT = LHS.getValueType().changeVectorElementTypeToInteger();
SDLoc dl(Op);
if (LHS.getValueType().getVectorElementType().isInteger()) {
assert(LHS.getValueType() == RHS.getValueType());
AArch64CC::CondCode AArch64CC = changeIntCCToAArch64CC(CC);
SDValue Cmp =
EmitVectorComparison(LHS, RHS, AArch64CC, false, CmpVT, dl, DAG);
return DAG.getSExtOrTrunc(Cmp, dl, Op.getValueType());
}
if (LHS.getValueType().getVectorElementType() == MVT::f16)
return SDValue();
assert(LHS.getValueType().getVectorElementType() == MVT::f32 ||
LHS.getValueType().getVectorElementType() == MVT::f64);
// Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally
// clean. Some of them require two branches to implement.
AArch64CC::CondCode CC1, CC2;
bool ShouldInvert;
changeVectorFPCCToAArch64CC(CC, CC1, CC2, ShouldInvert);
bool NoNaNs = getTargetMachine().Options.NoNaNsFPMath;
SDValue Cmp =
EmitVectorComparison(LHS, RHS, CC1, NoNaNs, CmpVT, dl, DAG);
if (!Cmp.getNode())
return SDValue();
if (CC2 != AArch64CC::AL) {
SDValue Cmp2 =
EmitVectorComparison(LHS, RHS, CC2, NoNaNs, CmpVT, dl, DAG);
if (!Cmp2.getNode())
return SDValue();
Cmp = DAG.getNode(ISD::OR, dl, CmpVT, Cmp, Cmp2);
}
Cmp = DAG.getSExtOrTrunc(Cmp, dl, Op.getValueType());
if (ShouldInvert)
return Cmp = DAG.getNOT(dl, Cmp, Cmp.getValueType());
return Cmp;
}
/// getTgtMemIntrinsic - Represent NEON load and store intrinsics as
/// MemIntrinsicNodes. The associated MachineMemOperands record the alignment
/// specified in the intrinsic calls.
bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
const CallInst &I,
unsigned Intrinsic) const {
auto &DL = I.getModule()->getDataLayout();
switch (Intrinsic) {
case Intrinsic::aarch64_neon_ld2:
case Intrinsic::aarch64_neon_ld3:
case Intrinsic::aarch64_neon_ld4:
case Intrinsic::aarch64_neon_ld1x2:
case Intrinsic::aarch64_neon_ld1x3:
case Intrinsic::aarch64_neon_ld1x4:
case Intrinsic::aarch64_neon_ld2lane:
case Intrinsic::aarch64_neon_ld3lane:
case Intrinsic::aarch64_neon_ld4lane:
case Intrinsic::aarch64_neon_ld2r:
case Intrinsic::aarch64_neon_ld3r:
case Intrinsic::aarch64_neon_ld4r: {
Info.opc = ISD::INTRINSIC_W_CHAIN;
// Conservatively set memVT to the entire set of vectors loaded.
uint64_t NumElts = DL.getTypeSizeInBits(I.getType()) / 64;
Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
Info.ptrVal = I.getArgOperand(I.getNumArgOperands() - 1);
Info.offset = 0;
Info.align = 0;
Info.vol = false; // volatile loads with NEON intrinsics not supported
Info.readMem = true;
Info.writeMem = false;
return true;
}
case Intrinsic::aarch64_neon_st2:
case Intrinsic::aarch64_neon_st3:
case Intrinsic::aarch64_neon_st4:
case Intrinsic::aarch64_neon_st1x2:
case Intrinsic::aarch64_neon_st1x3:
case Intrinsic::aarch64_neon_st1x4:
case Intrinsic::aarch64_neon_st2lane:
case Intrinsic::aarch64_neon_st3lane:
case Intrinsic::aarch64_neon_st4lane: {
Info.opc = ISD::INTRINSIC_VOID;
// Conservatively set memVT to the entire set of vectors stored.
unsigned NumElts = 0;
for (unsigned ArgI = 1, ArgE = I.getNumArgOperands(); ArgI < ArgE; ++ArgI) {
Type *ArgTy = I.getArgOperand(ArgI)->getType();
if (!ArgTy->isVectorTy())
break;
NumElts += DL.getTypeSizeInBits(ArgTy) / 64;
}
Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
Info.ptrVal = I.getArgOperand(I.getNumArgOperands() - 1);
Info.offset = 0;
Info.align = 0;
Info.vol = false; // volatile stores with NEON intrinsics not supported
Info.readMem = false;
Info.writeMem = true;
return true;
}
case Intrinsic::aarch64_ldaxr:
case Intrinsic::aarch64_ldxr: {
PointerType *PtrTy = cast<PointerType>(I.getArgOperand(0)->getType());
Info.opc = ISD::INTRINSIC_W_CHAIN;
Info.memVT = MVT::getVT(PtrTy->getElementType());
Info.ptrVal = I.getArgOperand(0);
Info.offset = 0;
Info.align = DL.getABITypeAlignment(PtrTy->getElementType());
Info.vol = true;
Info.readMem = true;
Info.writeMem = false;
return true;
}
case Intrinsic::aarch64_stlxr:
case Intrinsic::aarch64_stxr: {
PointerType *PtrTy = cast<PointerType>(I.getArgOperand(1)->getType());
Info.opc = ISD::INTRINSIC_W_CHAIN;
Info.memVT = MVT::getVT(PtrTy->getElementType());
Info.ptrVal = I.getArgOperand(1);
Info.offset = 0;
Info.align = DL.getABITypeAlignment(PtrTy->getElementType());
Info.vol = true;
Info.readMem = false;
Info.writeMem = true;
return true;
}
case Intrinsic::aarch64_ldaxp:
case Intrinsic::aarch64_ldxp:
Info.opc = ISD::INTRINSIC_W_CHAIN;
Info.memVT = MVT::i128;
Info.ptrVal = I.getArgOperand(0);
Info.offset = 0;
Info.align = 16;
Info.vol = true;
Info.readMem = true;
Info.writeMem = false;
return true;
case Intrinsic::aarch64_stlxp:
case Intrinsic::aarch64_stxp:
Info.opc = ISD::INTRINSIC_W_CHAIN;
Info.memVT = MVT::i128;
Info.ptrVal = I.getArgOperand(2);
Info.offset = 0;
Info.align = 16;
Info.vol = true;
Info.readMem = false;
Info.writeMem = true;
return true;
default:
break;
}
return false;
}
// Truncations from 64-bit GPR to 32-bit GPR is free.
bool AArch64TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
return false;
unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
return NumBits1 > NumBits2;
}
bool AArch64TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
if (VT1.isVector() || VT2.isVector() || !VT1.isInteger() || !VT2.isInteger())
return false;
unsigned NumBits1 = VT1.getSizeInBits();
unsigned NumBits2 = VT2.getSizeInBits();
return NumBits1 > NumBits2;
}
/// Check if it is profitable to hoist instruction in then/else to if.
/// Not profitable if I and it's user can form a FMA instruction
/// because we prefer FMSUB/FMADD.
bool AArch64TargetLowering::isProfitableToHoist(Instruction *I) const {
if (I->getOpcode() != Instruction::FMul)
return true;
if (I->getNumUses() != 1)
return true;
Instruction *User = I->user_back();
if (User &&
!(User->getOpcode() == Instruction::FSub ||
User->getOpcode() == Instruction::FAdd))
return true;
const TargetOptions &Options = getTargetMachine().Options;
const DataLayout &DL = I->getModule()->getDataLayout();
EVT VT = getValueType(DL, User->getOperand(0)->getType());
return !(isFMAFasterThanFMulAndFAdd(VT) &&
isOperationLegalOrCustom(ISD::FMA, VT) &&
(Options.AllowFPOpFusion == FPOpFusion::Fast ||
Options.UnsafeFPMath));
}
// All 32-bit GPR operations implicitly zero the high-half of the corresponding
// 64-bit GPR.
bool AArch64TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const {
if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
return false;
unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
return NumBits1 == 32 && NumBits2 == 64;
}
bool AArch64TargetLowering::isZExtFree(EVT VT1, EVT VT2) const {
if (VT1.isVector() || VT2.isVector() || !VT1.isInteger() || !VT2.isInteger())
return false;
unsigned NumBits1 = VT1.getSizeInBits();
unsigned NumBits2 = VT2.getSizeInBits();
return NumBits1 == 32 && NumBits2 == 64;
}
bool AArch64TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
EVT VT1 = Val.getValueType();
if (isZExtFree(VT1, VT2)) {
return true;
}
if (Val.getOpcode() != ISD::LOAD)
return false;
// 8-, 16-, and 32-bit integer loads all implicitly zero-extend.
return (VT1.isSimple() && !VT1.isVector() && VT1.isInteger() &&
VT2.isSimple() && !VT2.isVector() && VT2.isInteger() &&
VT1.getSizeInBits() <= 32);
}
bool AArch64TargetLowering::isExtFreeImpl(const Instruction *Ext) const {
if (isa<FPExtInst>(Ext))
return false;
// Vector types are next free.
if (Ext->getType()->isVectorTy())
return false;
for (const Use &U : Ext->uses()) {
// The extension is free if we can fold it with a left shift in an
// addressing mode or an arithmetic operation: add, sub, and cmp.
// Is there a shift?
const Instruction *Instr = cast<Instruction>(U.getUser());
// Is this a constant shift?
switch (Instr->getOpcode()) {
case Instruction::Shl:
if (!isa<ConstantInt>(Instr->getOperand(1)))
return false;
break;
case Instruction::GetElementPtr: {
gep_type_iterator GTI = gep_type_begin(Instr);
auto &DL = Ext->getModule()->getDataLayout();
std::advance(GTI, U.getOperandNo()-1);
Type *IdxTy = GTI.getIndexedType();
// This extension will end up with a shift because of the scaling factor.
// 8-bit sized types have a scaling factor of 1, thus a shift amount of 0.
// Get the shift amount based on the scaling factor:
// log2(sizeof(IdxTy)) - log2(8).
uint64_t ShiftAmt =
countTrailingZeros(DL.getTypeStoreSizeInBits(IdxTy)) - 3;
// Is the constant foldable in the shift of the addressing mode?
// I.e., shift amount is between 1 and 4 inclusive.
if (ShiftAmt == 0 || ShiftAmt > 4)
return false;
break;
}
case Instruction::Trunc:
// Check if this is a noop.
// trunc(sext ty1 to ty2) to ty1.
if (Instr->getType() == Ext->getOperand(0)->getType())
continue;
LLVM_FALLTHROUGH;
default:
return false;
}
// At this point we can use the bfm family, so this extension is free
// for that use.
}
return true;
}
bool AArch64TargetLowering::hasPairedLoad(EVT LoadedType,
unsigned &RequiredAligment) const {
if (!LoadedType.isSimple() ||
(!LoadedType.isInteger() && !LoadedType.isFloatingPoint()))
return false;
// Cyclone supports unaligned accesses.
RequiredAligment = 0;
unsigned NumBits = LoadedType.getSizeInBits();
return NumBits == 32 || NumBits == 64;
}
/// \brief Lower an interleaved load into a ldN intrinsic.
///
/// E.g. Lower an interleaved load (Factor = 2):
/// %wide.vec = load <8 x i32>, <8 x i32>* %ptr
/// %v0 = shuffle %wide.vec, undef, <0, 2, 4, 6> ; Extract even elements
/// %v1 = shuffle %wide.vec, undef, <1, 3, 5, 7> ; Extract odd elements
///
/// Into:
/// %ld2 = { <4 x i32>, <4 x i32> } call llvm.aarch64.neon.ld2(%ptr)
/// %vec0 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 0
/// %vec1 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 1
bool AArch64TargetLowering::lowerInterleavedLoad(
LoadInst *LI, ArrayRef<ShuffleVectorInst *> Shuffles,
ArrayRef<unsigned> Indices, unsigned Factor) const {
assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
"Invalid interleave factor");
assert(!Shuffles.empty() && "Empty shufflevector input");
assert(Shuffles.size() == Indices.size() &&
"Unmatched number of shufflevectors and indices");
const DataLayout &DL = LI->getModule()->getDataLayout();
VectorType *VecTy = Shuffles[0]->getType();
unsigned VecSize = DL.getTypeSizeInBits(VecTy);
// Skip if we do not have NEON and skip illegal vector types.
if (!Subtarget->hasNEON() || (VecSize != 64 && VecSize != 128))
return false;
// A pointer vector can not be the return type of the ldN intrinsics. Need to
// load integer vectors first and then convert to pointer vectors.
Type *EltTy = VecTy->getVectorElementType();
if (EltTy->isPointerTy())
VecTy =
VectorType::get(DL.getIntPtrType(EltTy), VecTy->getVectorNumElements());
Type *PtrTy = VecTy->getPointerTo(LI->getPointerAddressSpace());
Type *Tys[2] = {VecTy, PtrTy};
static const Intrinsic::ID LoadInts[3] = {Intrinsic::aarch64_neon_ld2,
Intrinsic::aarch64_neon_ld3,
Intrinsic::aarch64_neon_ld4};
Function *LdNFunc =
Intrinsic::getDeclaration(LI->getModule(), LoadInts[Factor - 2], Tys);
IRBuilder<> Builder(LI);
Value *Ptr = Builder.CreateBitCast(LI->getPointerOperand(), PtrTy);
CallInst *LdN = Builder.CreateCall(LdNFunc, Ptr, "ldN");
// Replace uses of each shufflevector with the corresponding vector loaded
// by ldN.
for (unsigned i = 0; i < Shuffles.size(); i++) {
ShuffleVectorInst *SVI = Shuffles[i];
unsigned Index = Indices[i];
Value *SubVec = Builder.CreateExtractValue(LdN, Index);
// Convert the integer vector to pointer vector if the element is pointer.
if (EltTy->isPointerTy())
SubVec = Builder.CreateIntToPtr(SubVec, SVI->getType());
SVI->replaceAllUsesWith(SubVec);
}
return true;
}
/// \brief Get a mask consisting of sequential integers starting from \p Start.
///
/// I.e. <Start, Start + 1, ..., Start + NumElts - 1>
static Constant *getSequentialMask(IRBuilder<> &Builder, unsigned Start,
unsigned NumElts) {
SmallVector<Constant *, 16> Mask;
for (unsigned i = 0; i < NumElts; i++)
Mask.push_back(Builder.getInt32(Start + i));
return ConstantVector::get(Mask);
}
/// \brief Lower an interleaved store into a stN intrinsic.
///
/// E.g. Lower an interleaved store (Factor = 3):
/// %i.vec = shuffle <8 x i32> %v0, <8 x i32> %v1,
/// <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>
/// store <12 x i32> %i.vec, <12 x i32>* %ptr
///
/// Into:
/// %sub.v0 = shuffle <8 x i32> %v0, <8 x i32> v1, <0, 1, 2, 3>
/// %sub.v1 = shuffle <8 x i32> %v0, <8 x i32> v1, <4, 5, 6, 7>
/// %sub.v2 = shuffle <8 x i32> %v0, <8 x i32> v1, <8, 9, 10, 11>
/// call void llvm.aarch64.neon.st3(%sub.v0, %sub.v1, %sub.v2, %ptr)
///
/// Note that the new shufflevectors will be removed and we'll only generate one
/// st3 instruction in CodeGen.
///
/// Example for a more general valid mask (Factor 3). Lower:
/// %i.vec = shuffle <32 x i32> %v0, <32 x i32> %v1,
/// <4, 32, 16, 5, 33, 17, 6, 34, 18, 7, 35, 19>
/// store <12 x i32> %i.vec, <12 x i32>* %ptr
///
/// Into:
/// %sub.v0 = shuffle <32 x i32> %v0, <32 x i32> v1, <4, 5, 6, 7>
/// %sub.v1 = shuffle <32 x i32> %v0, <32 x i32> v1, <32, 33, 34, 35>
/// %sub.v2 = shuffle <32 x i32> %v0, <32 x i32> v1, <16, 17, 18, 19>
/// call void llvm.aarch64.neon.st3(%sub.v0, %sub.v1, %sub.v2, %ptr)
bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI,
ShuffleVectorInst *SVI,
unsigned Factor) const {
assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
"Invalid interleave factor");
VectorType *VecTy = SVI->getType();
assert(VecTy->getVectorNumElements() % Factor == 0 &&
"Invalid interleaved store");
unsigned LaneLen = VecTy->getVectorNumElements() / Factor;
Type *EltTy = VecTy->getVectorElementType();
VectorType *SubVecTy = VectorType::get(EltTy, LaneLen);
const DataLayout &DL = SI->getModule()->getDataLayout();
unsigned SubVecSize = DL.getTypeSizeInBits(SubVecTy);
// Skip if we do not have NEON and skip illegal vector types.
if (!Subtarget->hasNEON() || (SubVecSize != 64 && SubVecSize != 128))
return false;
Value *Op0 = SVI->getOperand(0);
Value *Op1 = SVI->getOperand(1);
IRBuilder<> Builder(SI);
// StN intrinsics don't support pointer vectors as arguments. Convert pointer
// vectors to integer vectors.
if (EltTy->isPointerTy()) {
Type *IntTy = DL.getIntPtrType(EltTy);
unsigned NumOpElts =
dyn_cast<VectorType>(Op0->getType())->getVectorNumElements();
// Convert to the corresponding integer vector.
Type *IntVecTy = VectorType::get(IntTy, NumOpElts);
Op0 = Builder.CreatePtrToInt(Op0, IntVecTy);
Op1 = Builder.CreatePtrToInt(Op1, IntVecTy);
SubVecTy = VectorType::get(IntTy, LaneLen);
}
Type *PtrTy = SubVecTy->getPointerTo(SI->getPointerAddressSpace());
Type *Tys[2] = {SubVecTy, PtrTy};
static const Intrinsic::ID StoreInts[3] = {Intrinsic::aarch64_neon_st2,
Intrinsic::aarch64_neon_st3,
Intrinsic::aarch64_neon_st4};
Function *StNFunc =
Intrinsic::getDeclaration(SI->getModule(), StoreInts[Factor - 2], Tys);
SmallVector<Value *, 5> Ops;
// Split the shufflevector operands into sub vectors for the new stN call.
auto Mask = SVI->getShuffleMask();
for (unsigned i = 0; i < Factor; i++) {
if (Mask[i] >= 0) {
Ops.push_back(Builder.CreateShuffleVector(
Op0, Op1, getSequentialMask(Builder, Mask[i], LaneLen)));
} else {
unsigned StartMask = 0;
for (unsigned j = 1; j < LaneLen; j++) {
if (Mask[j*Factor + i] >= 0) {
StartMask = Mask[j*Factor + i] - j;
break;
}
}
// Note: If all elements in a chunk are undefs, StartMask=0!
// Note: Filling undef gaps with random elements is ok, since
// those elements were being written anyway (with undefs).
// In the case of all undefs we're defaulting to using elems from 0
// Note: StartMask cannot be negative, it's checked in isReInterleaveMask
Ops.push_back(Builder.CreateShuffleVector(
Op0, Op1, getSequentialMask(Builder, StartMask, LaneLen)));
}
}
Ops.push_back(Builder.CreateBitCast(SI->getPointerOperand(), PtrTy));
Builder.CreateCall(StNFunc, Ops);
return true;
}
static bool memOpAlign(unsigned DstAlign, unsigned SrcAlign,
unsigned AlignCheck) {
return ((SrcAlign == 0 || SrcAlign % AlignCheck == 0) &&
(DstAlign == 0 || DstAlign % AlignCheck == 0));
}
EVT AArch64TargetLowering::getOptimalMemOpType(uint64_t Size, unsigned DstAlign,
unsigned SrcAlign, bool IsMemset,
bool ZeroMemset,
bool MemcpyStrSrc,
MachineFunction &MF) const {
// Don't use AdvSIMD to implement 16-byte memset. It would have taken one
// instruction to materialize the v2i64 zero and one store (with restrictive
// addressing mode). Just do two i64 store of zero-registers.
bool Fast;
const Function *F = MF.getFunction();
if (Subtarget->hasFPARMv8() && !IsMemset && Size >= 16 &&
!F->hasFnAttribute(Attribute::NoImplicitFloat) &&
(memOpAlign(SrcAlign, DstAlign, 16) ||
(allowsMisalignedMemoryAccesses(MVT::f128, 0, 1, &Fast) && Fast)))
return MVT::f128;
if (Size >= 8 &&
(memOpAlign(SrcAlign, DstAlign, 8) ||
(allowsMisalignedMemoryAccesses(MVT::i64, 0, 1, &Fast) && Fast)))
return MVT::i64;
if (Size >= 4 &&
(memOpAlign(SrcAlign, DstAlign, 4) ||
(allowsMisalignedMemoryAccesses(MVT::i32, 0, 1, &Fast) && Fast)))
return MVT::i32;
return MVT::Other;
}
// 12-bit optionally shifted immediates are legal for adds.
bool AArch64TargetLowering::isLegalAddImmediate(int64_t Immed) const {
// Avoid UB for INT64_MIN.
if (Immed == std::numeric_limits<int64_t>::min())
return false;
// Same encoding for add/sub, just flip the sign.
Immed = std::abs(Immed);
return ((Immed >> 12) == 0 || ((Immed & 0xfff) == 0 && Immed >> 24 == 0));
}
// Integer comparisons are implemented with ADDS/SUBS, so the range of valid
// immediates is the same as for an add or a sub.
bool AArch64TargetLowering::isLegalICmpImmediate(int64_t Immed) const {
return isLegalAddImmediate(Immed);
}
/// isLegalAddressingMode - Return true if the addressing mode represented
/// by AM is legal for this target, for a load/store of the specified type.
bool AArch64TargetLowering::isLegalAddressingMode(const DataLayout &DL,
const AddrMode &AM, Type *Ty,
unsigned AS) const {
// AArch64 has five basic addressing modes:
// reg
// reg + 9-bit signed offset
// reg + SIZE_IN_BYTES * 12-bit unsigned offset
// reg1 + reg2
// reg + SIZE_IN_BYTES * reg
// No global is ever allowed as a base.
if (AM.BaseGV)
return false;
// No reg+reg+imm addressing.
if (AM.HasBaseReg && AM.BaseOffs && AM.Scale)
return false;
// check reg + imm case:
// i.e., reg + 0, reg + imm9, reg + SIZE_IN_BYTES * uimm12
uint64_t NumBytes = 0;
if (Ty->isSized()) {
uint64_t NumBits = DL.getTypeSizeInBits(Ty);
NumBytes = NumBits / 8;
if (!isPowerOf2_64(NumBits))
NumBytes = 0;
}
if (!AM.Scale) {
int64_t Offset = AM.BaseOffs;
// 9-bit signed offset
if (isInt<9>(Offset))
return true;
// 12-bit unsigned offset
unsigned shift = Log2_64(NumBytes);
if (NumBytes && Offset > 0 && (Offset / NumBytes) <= (1LL << 12) - 1 &&
// Must be a multiple of NumBytes (NumBytes is a power of 2)
(Offset >> shift) << shift == Offset)
return true;
return false;
}
// Check reg1 + SIZE_IN_BYTES * reg2 and reg1 + reg2
return AM.Scale == 1 || (AM.Scale > 0 && (uint64_t)AM.Scale == NumBytes);
}
int AArch64TargetLowering::getScalingFactorCost(const DataLayout &DL,
const AddrMode &AM, Type *Ty,
unsigned AS) const {
// Scaling factors are not free at all.
// Operands | Rt Latency
// -------------------------------------------
// Rt, [Xn, Xm] | 4
// -------------------------------------------
// Rt, [Xn, Xm, lsl #imm] | Rn: 4 Rm: 5
// Rt, [Xn, Wm, <extend> #imm] |
if (isLegalAddressingMode(DL, AM, Ty, AS))
// Scale represents reg2 * scale, thus account for 1 if
// it is not equal to 0 or 1.
return AM.Scale != 0 && AM.Scale != 1;
return -1;
}
bool AArch64TargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
VT = VT.getScalarType();
if (!VT.isSimple())
return false;
switch (VT.getSimpleVT().SimpleTy) {
case MVT::f32:
case MVT::f64:
return true;
default:
break;
}
return false;
}
const MCPhysReg *
AArch64TargetLowering::getScratchRegisters(CallingConv::ID) const {
// LR is a callee-save register, but we must treat it as clobbered by any call
// site. Hence we include LR in the scratch registers, which are in turn added
// as implicit-defs for stackmaps and patchpoints.
static const MCPhysReg ScratchRegs[] = {
AArch64::X16, AArch64::X17, AArch64::LR, 0
};
return ScratchRegs;
}
bool
AArch64TargetLowering::isDesirableToCommuteWithShift(const SDNode *N) const {
EVT VT = N->getValueType(0);
// If N is unsigned bit extraction: ((x >> C) & mask), then do not combine
// it with shift to let it be lowered to UBFX.
if (N->getOpcode() == ISD::AND && (VT == MVT::i32 || VT == MVT::i64) &&
isa<ConstantSDNode>(N->getOperand(1))) {
uint64_t TruncMask = N->getConstantOperandVal(1);
if (isMask_64(TruncMask) &&
N->getOperand(0).getOpcode() == ISD::SRL &&
isa<ConstantSDNode>(N->getOperand(0)->getOperand(1)))
return false;
}
return true;
}
bool AArch64TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
Type *Ty) const {
assert(Ty->isIntegerTy());
unsigned BitSize = Ty->getPrimitiveSizeInBits();
if (BitSize == 0)
return false;
int64_t Val = Imm.getSExtValue();
if (Val == 0 || AArch64_AM::isLogicalImmediate(Val, BitSize))
return true;
if ((int64_t)Val < 0)
Val = ~Val;
if (BitSize == 32)
Val &= (1LL << 32) - 1;
unsigned LZ = countLeadingZeros((uint64_t)Val);
unsigned Shift = (63 - LZ) / 16;
// MOVZ is free so return true for one or fewer MOVK.
return Shift < 3;
}
/// Turn vector tests of the signbit in the form of:
/// xor (sra X, elt_size(X)-1), -1
/// into:
/// cmge X, X, #0
static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG,
const AArch64Subtarget *Subtarget) {
EVT VT = N->getValueType(0);
if (!Subtarget->hasNEON() || !VT.isVector())
return SDValue();
// There must be a shift right algebraic before the xor, and the xor must be a
// 'not' operation.
SDValue Shift = N->getOperand(0);
SDValue Ones = N->getOperand(1);
if (Shift.getOpcode() != AArch64ISD::VASHR || !Shift.hasOneUse() ||
!ISD::isBuildVectorAllOnes(Ones.getNode()))
return SDValue();
// The shift should be smearing the sign bit across each vector element.
auto *ShiftAmt = dyn_cast<ConstantSDNode>(Shift.getOperand(1));
EVT ShiftEltTy = Shift.getValueType().getVectorElementType();
if (!ShiftAmt || ShiftAmt->getZExtValue() != ShiftEltTy.getSizeInBits() - 1)
return SDValue();
return DAG.getNode(AArch64ISD::CMGEz, SDLoc(N), VT, Shift.getOperand(0));
}
// Generate SUBS and CSEL for integer abs.
static SDValue performIntegerAbsCombine(SDNode *N, SelectionDAG &DAG) {
EVT VT = N->getValueType(0);
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
SDLoc DL(N);
// Check pattern of XOR(ADD(X,Y), Y) where Y is SRA(X, size(X)-1)
// and change it to SUB and CSEL.
if (VT.isInteger() && N->getOpcode() == ISD::XOR &&
N0.getOpcode() == ISD::ADD && N0.getOperand(1) == N1 &&
N1.getOpcode() == ISD::SRA && N1.getOperand(0) == N0.getOperand(0))
if (ConstantSDNode *Y1C = dyn_cast<ConstantSDNode>(N1.getOperand(1)))
if (Y1C->getAPIntValue() == VT.getSizeInBits() - 1) {
SDValue Neg = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
N0.getOperand(0));
// Generate SUBS & CSEL.
SDValue Cmp =
DAG.getNode(AArch64ISD::SUBS, DL, DAG.getVTList(VT, MVT::i32),
N0.getOperand(0), DAG.getConstant(0, DL, VT));
return DAG.getNode(AArch64ISD::CSEL, DL, VT, N0.getOperand(0), Neg,
DAG.getConstant(AArch64CC::PL, DL, MVT::i32),
SDValue(Cmp.getNode(), 1));
}
return SDValue();
}
static SDValue performXorCombine(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const AArch64Subtarget *Subtarget) {
if (DCI.isBeforeLegalizeOps())
return SDValue();
if (SDValue Cmp = foldVectorXorShiftIntoCmp(N, DAG, Subtarget))
return Cmp;
return performIntegerAbsCombine(N, DAG);
}
SDValue
AArch64TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
SelectionDAG &DAG,
std::vector<SDNode *> *Created) const {
AttributeSet Attr = DAG.getMachineFunction().getFunction()->getAttributes();
if (isIntDivCheap(N->getValueType(0), Attr))
return SDValue(N,0); // Lower SDIV as SDIV
// fold (sdiv X, pow2)
EVT VT = N->getValueType(0);
if ((VT != MVT::i32 && VT != MVT::i64) ||
!(Divisor.isPowerOf2() || (-Divisor).isPowerOf2()))
return SDValue();
SDLoc DL(N);
SDValue N0 = N->getOperand(0);
unsigned Lg2 = Divisor.countTrailingZeros();
SDValue Zero = DAG.getConstant(0, DL, VT);
SDValue Pow2MinusOne = DAG.getConstant((1ULL << Lg2) - 1, DL, VT);
// Add (N0 < 0) ? Pow2 - 1 : 0;
SDValue CCVal;
SDValue Cmp = getAArch64Cmp(N0, Zero, ISD::SETLT, CCVal, DAG, DL);
SDValue Add = DAG.getNode(ISD::ADD, DL, VT, N0, Pow2MinusOne);
SDValue CSel = DAG.getNode(AArch64ISD::CSEL, DL, VT, Add, N0, CCVal, Cmp);
if (Created) {
Created->push_back(Cmp.getNode());
Created->push_back(Add.getNode());
Created->push_back(CSel.getNode());
}
// Divide by pow2.
SDValue SRA =
DAG.getNode(ISD::SRA, DL, VT, CSel, DAG.getConstant(Lg2, DL, MVT::i64));
// If we're dividing by a positive value, we're done. Otherwise, we must
// negate the result.
if (Divisor.isNonNegative())
return SRA;
if (Created)
Created->push_back(SRA.getNode());
return DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), SRA);
}
static SDValue performMulCombine(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const AArch64Subtarget *Subtarget) {
if (DCI.isBeforeLegalizeOps())
return SDValue();
// The below optimizations require a constant RHS.
if (!isa<ConstantSDNode>(N->getOperand(1)))
return SDValue();
ConstantSDNode *C = cast<ConstantSDNode>(N->getOperand(1));
const APInt &ConstValue = C->getAPIntValue();
// Multiplication of a power of two plus/minus one can be done more
// cheaply as as shift+add/sub. For now, this is true unilaterally. If
// future CPUs have a cheaper MADD instruction, this may need to be
// gated on a subtarget feature. For Cyclone, 32-bit MADD is 4 cycles and
// 64-bit is 5 cycles, so this is always a win.
// More aggressively, some multiplications N0 * C can be lowered to
// shift+add+shift if the constant C = A * B where A = 2^N + 1 and B = 2^M,
// e.g. 6=3*2=(2+1)*2.
// TODO: consider lowering more cases, e.g. C = 14, -6, -14 or even 45
// which equals to (1+2)*16-(1+2).
SDValue N0 = N->getOperand(0);
// TrailingZeroes is used to test if the mul can be lowered to
// shift+add+shift.
unsigned TrailingZeroes = ConstValue.countTrailingZeros();
if (TrailingZeroes) {
// Conservatively do not lower to shift+add+shift if the mul might be
// folded into smul or umul.
if (N0->hasOneUse() && (isSignExtended(N0.getNode(), DAG) ||
isZeroExtended(N0.getNode(), DAG)))
return SDValue();
// Conservatively do not lower to shift+add+shift if the mul might be
// folded into madd or msub.
if (N->hasOneUse() && (N->use_begin()->getOpcode() == ISD::ADD ||
N->use_begin()->getOpcode() == ISD::SUB))
return SDValue();
}
// Use ShiftedConstValue instead of ConstValue to support both shift+add/sub
// and shift+add+shift.
APInt ShiftedConstValue = ConstValue.ashr(TrailingZeroes);
unsigned ShiftAmt, AddSubOpc;
// Is the shifted value the LHS operand of the add/sub?
bool ShiftValUseIsN0 = true;
// Do we need to negate the result?
bool NegateResult = false;
if (ConstValue.isNonNegative()) {
// (mul x, 2^N + 1) => (add (shl x, N), x)
// (mul x, 2^N - 1) => (sub (shl x, N), x)
// (mul x, (2^N + 1) * 2^M) => (shl (add (shl x, N), x), M)
APInt SCVMinus1 = ShiftedConstValue - 1;
APInt CVPlus1 = ConstValue + 1;
if (SCVMinus1.isPowerOf2()) {
ShiftAmt = SCVMinus1.logBase2();
AddSubOpc = ISD::ADD;
} else if (CVPlus1.isPowerOf2()) {
ShiftAmt = CVPlus1.logBase2();
AddSubOpc = ISD::SUB;
} else
return SDValue();
} else {
// (mul x, -(2^N - 1)) => (sub x, (shl x, N))
// (mul x, -(2^N + 1)) => - (add (shl x, N), x)
APInt CVNegPlus1 = -ConstValue + 1;
APInt CVNegMinus1 = -ConstValue - 1;
if (CVNegPlus1.isPowerOf2()) {
ShiftAmt = CVNegPlus1.logBase2();
AddSubOpc = ISD::SUB;
ShiftValUseIsN0 = false;
} else if (CVNegMinus1.isPowerOf2()) {
ShiftAmt = CVNegMinus1.logBase2();
AddSubOpc = ISD::ADD;
NegateResult = true;
} else
return SDValue();
}
SDLoc DL(N);
EVT VT = N->getValueType(0);
SDValue ShiftedVal = DAG.getNode(ISD::SHL, DL, VT, N0,
DAG.getConstant(ShiftAmt, DL, MVT::i64));
SDValue AddSubN0 = ShiftValUseIsN0 ? ShiftedVal : N0;
SDValue AddSubN1 = ShiftValUseIsN0 ? N0 : ShiftedVal;
SDValue Res = DAG.getNode(AddSubOpc, DL, VT, AddSubN0, AddSubN1);
assert(!(NegateResult && TrailingZeroes) &&
"NegateResult and TrailingZeroes cannot both be true for now.");
// Negate the result.
if (NegateResult)
return DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Res);
// Shift the result.
if (TrailingZeroes)
return DAG.getNode(ISD::SHL, DL, VT, Res,
DAG.getConstant(TrailingZeroes, DL, MVT::i64));
return Res;
}
static SDValue performVectorCompareAndMaskUnaryOpCombine(SDNode *N,
SelectionDAG &DAG) {
// Take advantage of vector comparisons producing 0 or -1 in each lane to
// optimize away operation when it's from a constant.
//
// The general transformation is:
// UNARYOP(AND(VECTOR_CMP(x,y), constant)) -->
// AND(VECTOR_CMP(x,y), constant2)
// constant2 = UNARYOP(constant)
// Early exit if this isn't a vector operation, the operand of the
// unary operation isn't a bitwise AND, or if the sizes of the operations
// aren't the same.
EVT VT = N->getValueType(0);
if (!VT.isVector() || N->getOperand(0)->getOpcode() != ISD::AND ||
N->getOperand(0)->getOperand(0)->getOpcode() != ISD::SETCC ||
VT.getSizeInBits() != N->getOperand(0)->getValueType(0).getSizeInBits())
return SDValue();
// Now check that the other operand of the AND is a constant. We could
// make the transformation for non-constant splats as well, but it's unclear
// that would be a benefit as it would not eliminate any operations, just
// perform one more step in scalar code before moving to the vector unit.
if (BuildVectorSDNode *BV =
dyn_cast<BuildVectorSDNode>(N->getOperand(0)->getOperand(1))) {
// Bail out if the vector isn't a constant.
if (!BV->isConstant())
return SDValue();
// Everything checks out. Build up the new and improved node.
SDLoc DL(N);
EVT IntVT = BV->getValueType(0);
// Create a new constant of the appropriate type for the transformed
// DAG.
SDValue SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0));
// The AND node needs bitcasts to/from an integer vector type around it.
SDValue MaskConst = DAG.getNode(ISD::BITCAST, DL, IntVT, SourceConst);
SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT,
N->getOperand(0)->getOperand(0), MaskConst);
SDValue Res = DAG.getNode(ISD::BITCAST, DL, VT, NewAnd);
return Res;
}
return SDValue();
}
static SDValue performIntToFpCombine(SDNode *N, SelectionDAG &DAG,
const AArch64Subtarget *Subtarget) {
// First try to optimize away the conversion when it's conditionally from
// a constant. Vectors only.
if (SDValue Res = performVectorCompareAndMaskUnaryOpCombine(N, DAG))
return Res;
EVT VT = N->getValueType(0);
if (VT != MVT::f32 && VT != MVT::f64)
return SDValue();
// Only optimize when the source and destination types have the same width.
if (VT.getSizeInBits() != N->getOperand(0).getValueSizeInBits())
return SDValue();
// If the result of an integer load is only used by an integer-to-float
// conversion, use a fp load instead and a AdvSIMD scalar {S|U}CVTF instead.
// This eliminates an "integer-to-vector-move" UOP and improves throughput.
SDValue N0 = N->getOperand(0);
if (Subtarget->hasNEON() && ISD::isNormalLoad(N0.getNode()) && N0.hasOneUse() &&
// Do not change the width of a volatile load.
!cast<LoadSDNode>(N0)->isVolatile()) {
LoadSDNode *LN0 = cast<LoadSDNode>(N0);
SDValue Load = DAG.getLoad(VT, SDLoc(N), LN0->getChain(), LN0->getBasePtr(),
LN0->getPointerInfo(), LN0->getAlignment(),
LN0->getMemOperand()->getFlags());
// Make sure successors of the original load stay after it by updating them
// to use the new Chain.
DAG.ReplaceAllUsesOfValueWith(SDValue(LN0, 1), Load.getValue(1));
unsigned Opcode =
(N->getOpcode() == ISD::SINT_TO_FP) ? AArch64ISD::SITOF : AArch64ISD::UITOF;
return DAG.getNode(Opcode, SDLoc(N), VT, Load);
}
return SDValue();
}
/// Fold a floating-point multiply by power of two into floating-point to
/// fixed-point conversion.
static SDValue performFpToIntCombine(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const AArch64Subtarget *Subtarget) {
if (!Subtarget->hasNEON())
return SDValue();
SDValue Op = N->getOperand(0);
if (!Op.getValueType().isVector() || !Op.getValueType().isSimple() ||
Op.getOpcode() != ISD::FMUL)
return SDValue();
SDValue ConstVec = Op->getOperand(1);
if (!isa<BuildVectorSDNode>(ConstVec))
return SDValue();
MVT FloatTy = Op.getSimpleValueType().getVectorElementType();
uint32_t FloatBits = FloatTy.getSizeInBits();
if (FloatBits != 32 && FloatBits != 64)
return SDValue();
MVT IntTy = N->getSimpleValueType(0).getVectorElementType();
uint32_t IntBits = IntTy.getSizeInBits();
if (IntBits != 16 && IntBits != 32 && IntBits != 64)
return SDValue();
// Avoid conversions where iN is larger than the float (e.g., float -> i64).
if (IntBits > FloatBits)
return SDValue();
BitVector UndefElements;
BuildVectorSDNode *BV = cast<BuildVectorSDNode>(ConstVec);
int32_t Bits = IntBits == 64 ? 64 : 32;
int32_t C = BV->getConstantFPSplatPow2ToLog2Int(&UndefElements, Bits + 1);
if (C == -1 || C == 0 || C > Bits)
return SDValue();
MVT ResTy;
unsigned NumLanes = Op.getValueType().getVectorNumElements();
switch (NumLanes) {
default:
return SDValue();
case 2:
ResTy = FloatBits == 32 ? MVT::v2i32 : MVT::v2i64;
break;
case 4:
ResTy = FloatBits == 32 ? MVT::v4i32 : MVT::v4i64;
break;
}
if (ResTy == MVT::v4i64 && DCI.isBeforeLegalizeOps())
return SDValue();
assert((ResTy != MVT::v4i64 || DCI.isBeforeLegalizeOps()) &&
"Illegal vector type after legalization");
SDLoc DL(N);
bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT;
unsigned IntrinsicOpcode = IsSigned ? Intrinsic::aarch64_neon_vcvtfp2fxs
: Intrinsic::aarch64_neon_vcvtfp2fxu;
SDValue FixConv =
DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, ResTy,
DAG.getConstant(IntrinsicOpcode, DL, MVT::i32),
Op->getOperand(0), DAG.getConstant(C, DL, MVT::i32));
// We can handle smaller integers by generating an extra trunc.
if (IntBits < FloatBits)
FixConv = DAG.getNode(ISD::TRUNCATE, DL, N->getValueType(0), FixConv);
return FixConv;
}
/// Fold a floating-point divide by power of two into fixed-point to
/// floating-point conversion.
static SDValue performFDivCombine(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const AArch64Subtarget *Subtarget) {
if (!Subtarget->hasNEON())
return SDValue();
SDValue Op = N->getOperand(0);
unsigned Opc = Op->getOpcode();
if (!Op.getValueType().isVector() || !Op.getValueType().isSimple() ||
!Op.getOperand(0).getValueType().isSimple() ||
(Opc != ISD::SINT_TO_FP && Opc != ISD::UINT_TO_FP))
return SDValue();
SDValue ConstVec = N->getOperand(1);
if (!isa<BuildVectorSDNode>(ConstVec))
return SDValue();
MVT IntTy = Op.getOperand(0).getSimpleValueType().getVectorElementType();
int32_t IntBits = IntTy.getSizeInBits();
if (IntBits != 16 && IntBits != 32 && IntBits != 64)
return SDValue();
MVT FloatTy = N->getSimpleValueType(0).getVectorElementType();
int32_t FloatBits = FloatTy.getSizeInBits();
if (FloatBits != 32 && FloatBits != 64)
return SDValue();
// Avoid conversions where iN is larger than the float (e.g., i64 -> float).
if (IntBits > FloatBits)
return SDValue();
BitVector UndefElements;
BuildVectorSDNode *BV = cast<BuildVectorSDNode>(ConstVec);
int32_t C = BV->getConstantFPSplatPow2ToLog2Int(&UndefElements, FloatBits + 1);
if (C == -1 || C == 0 || C > FloatBits)
return SDValue();
MVT ResTy;
unsigned NumLanes = Op.getValueType().getVectorNumElements();
switch (NumLanes) {
default:
return SDValue();
case 2:
ResTy = FloatBits == 32 ? MVT::v2i32 : MVT::v2i64;
break;
case 4:
ResTy = FloatBits == 32 ? MVT::v4i32 : MVT::v4i64;
break;
}
if (ResTy == MVT::v4i64 && DCI.isBeforeLegalizeOps())
return SDValue();
SDLoc DL(N);
SDValue ConvInput = Op.getOperand(0);
bool IsSigned = Opc == ISD::SINT_TO_FP;
if (IntBits < FloatBits)
ConvInput = DAG.getNode(IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, DL,
ResTy, ConvInput);
unsigned IntrinsicOpcode = IsSigned ? Intrinsic::aarch64_neon_vcvtfxs2fp
: Intrinsic::aarch64_neon_vcvtfxu2fp;
return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op.getValueType(),
DAG.getConstant(IntrinsicOpcode, DL, MVT::i32), ConvInput,
DAG.getConstant(C, DL, MVT::i32));
}
/// An EXTR instruction is made up of two shifts, ORed together. This helper
/// searches for and classifies those shifts.
static bool findEXTRHalf(SDValue N, SDValue &Src, uint32_t &ShiftAmount,
bool &FromHi) {
if (N.getOpcode() == ISD::SHL)
FromHi = false;
else if (N.getOpcode() == ISD::SRL)
FromHi = true;
else
return false;
if (!isa<ConstantSDNode>(N.getOperand(1)))
return false;
ShiftAmount = N->getConstantOperandVal(1);
Src = N->getOperand(0);
return true;
}
/// EXTR instruction extracts a contiguous chunk of bits from two existing
/// registers viewed as a high/low pair. This function looks for the pattern:
/// (or (shl VAL1, #N), (srl VAL2, #RegWidth-N)) and replaces it with an
/// EXTR. Can't quite be done in TableGen because the two immediates aren't
/// independent.
static SDValue tryCombineToEXTR(SDNode *N,
TargetLowering::DAGCombinerInfo &DCI) {
SelectionDAG &DAG = DCI.DAG;
SDLoc DL(N);
EVT VT = N->getValueType(0);
assert(N->getOpcode() == ISD::OR && "Unexpected root");
if (VT != MVT::i32 && VT != MVT::i64)
return SDValue();
SDValue LHS;
uint32_t ShiftLHS = 0;
bool LHSFromHi = false;
if (!findEXTRHalf(N->getOperand(0), LHS, ShiftLHS, LHSFromHi))
return SDValue();
SDValue RHS;
uint32_t ShiftRHS = 0;
bool RHSFromHi = false;
if (!findEXTRHalf(N->getOperand(1), RHS, ShiftRHS, RHSFromHi))
return SDValue();
// If they're both trying to come from the high part of the register, they're
// not really an EXTR.
if (LHSFromHi == RHSFromHi)
return SDValue();
if (ShiftLHS + ShiftRHS != VT.getSizeInBits())
return SDValue();
if (LHSFromHi) {
std::swap(LHS, RHS);
std::swap(ShiftLHS, ShiftRHS);
}
return DAG.getNode(AArch64ISD::EXTR, DL, VT, LHS, RHS,
DAG.getConstant(ShiftRHS, DL, MVT::i64));
}
static SDValue tryCombineToBSL(SDNode *N,
TargetLowering::DAGCombinerInfo &DCI) {
EVT VT = N->getValueType(0);
SelectionDAG &DAG = DCI.DAG;
SDLoc DL(N);
if (!VT.isVector())
return SDValue();
SDValue N0 = N->getOperand(0);
if (N0.getOpcode() != ISD::AND)
return SDValue();
SDValue N1 = N->getOperand(1);
if (N1.getOpcode() != ISD::AND)
return SDValue();
// We only have to look for constant vectors here since the general, variable
// case can be handled in TableGen.
unsigned Bits = VT.getScalarSizeInBits();
uint64_t BitMask = Bits == 64 ? -1ULL : ((1ULL << Bits) - 1);
for (int i = 1; i >= 0; --i)
for (int j = 1; j >= 0; --j) {
BuildVectorSDNode *BVN0 = dyn_cast<BuildVectorSDNode>(N0->getOperand(i));
BuildVectorSDNode *BVN1 = dyn_cast<BuildVectorSDNode>(N1->getOperand(j));
if (!BVN0 || !BVN1)
continue;
bool FoundMatch = true;
for (unsigned k = 0; k < VT.getVectorNumElements(); ++k) {
ConstantSDNode *CN0 = dyn_cast<ConstantSDNode>(BVN0->getOperand(k));
ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(BVN1->getOperand(k));
if (!CN0 || !CN1 ||
CN0->getZExtValue() != (BitMask & ~CN1->getZExtValue())) {
FoundMatch = false;
break;
}
}
if (FoundMatch)
return DAG.getNode(AArch64ISD::BSL, DL, VT, SDValue(BVN0, 0),
N0->getOperand(1 - i), N1->getOperand(1 - j));
}
return SDValue();
}
static SDValue performORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
const AArch64Subtarget *Subtarget) {
// Attempt to form an EXTR from (or (shl VAL1, #N), (srl VAL2, #RegWidth-N))
SelectionDAG &DAG = DCI.DAG;
EVT VT = N->getValueType(0);
if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
return SDValue();
if (SDValue Res = tryCombineToEXTR(N, DCI))
return Res;
if (SDValue Res = tryCombineToBSL(N, DCI))
return Res;
return SDValue();
}
static SDValue performSRLCombine(SDNode *N,
TargetLowering::DAGCombinerInfo &DCI) {
SelectionDAG &DAG = DCI.DAG;
EVT VT = N->getValueType(0);
if (VT != MVT::i32 && VT != MVT::i64)
return SDValue();
// Canonicalize (srl (bswap i32 x), 16) to (rotr (bswap i32 x), 16), if the
// high 16-bits of x are zero. Similarly, canonicalize (srl (bswap i64 x), 32)
// to (rotr (bswap i64 x), 32), if the high 32-bits of x are zero.
SDValue N0 = N->getOperand(0);
if (N0.getOpcode() == ISD::BSWAP) {
SDLoc DL(N);
SDValue N1 = N->getOperand(1);
SDValue N00 = N0.getOperand(0);
if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(N1)) {
uint64_t ShiftAmt = C->getZExtValue();
if (VT == MVT::i32 && ShiftAmt == 16 &&
DAG.MaskedValueIsZero(N00, APInt::getHighBitsSet(32, 16)))
return DAG.getNode(ISD::ROTR, DL, VT, N0, N1);
if (VT == MVT::i64 && ShiftAmt == 32 &&
DAG.MaskedValueIsZero(N00, APInt::getHighBitsSet(64, 32)))
return DAG.getNode(ISD::ROTR, DL, VT, N0, N1);
}
}
return SDValue();
}
static SDValue performBitcastCombine(SDNode *N,
TargetLowering::DAGCombinerInfo &DCI,
SelectionDAG &DAG) {
// Wait 'til after everything is legalized to try this. That way we have
// legal vector types and such.
if (DCI.isBeforeLegalizeOps())
return SDValue();
// Remove extraneous bitcasts around an extract_subvector.
// For example,
// (v4i16 (bitconvert
// (extract_subvector (v2i64 (bitconvert (v8i16 ...)), (i64 1)))))
// becomes
// (extract_subvector ((v8i16 ...), (i64 4)))
// Only interested in 64-bit vectors as the ultimate result.
EVT VT = N->getValueType(0);
if (!VT.isVector())
return SDValue();
if (VT.getSimpleVT().getSizeInBits() != 64)
return SDValue();
// Is the operand an extract_subvector starting at the beginning or halfway
// point of the vector? A low half may also come through as an
// EXTRACT_SUBREG, so look for that, too.
SDValue Op0 = N->getOperand(0);
if (Op0->getOpcode() != ISD::EXTRACT_SUBVECTOR &&
!(Op0->isMachineOpcode() &&
Op0->getMachineOpcode() == AArch64::EXTRACT_SUBREG))
return SDValue();
uint64_t idx = cast<ConstantSDNode>(Op0->getOperand(1))->getZExtValue();
if (Op0->getOpcode() == ISD::EXTRACT_SUBVECTOR) {
if (Op0->getValueType(0).getVectorNumElements() != idx && idx != 0)
return SDValue();
} else if (Op0->getMachineOpcode() == AArch64::EXTRACT_SUBREG) {
if (idx != AArch64::dsub)
return SDValue();
// The dsub reference is equivalent to a lane zero subvector reference.
idx = 0;
}
// Look through the bitcast of the input to the extract.
if (Op0->getOperand(0)->getOpcode() != ISD::BITCAST)
return SDValue();
SDValue Source = Op0->getOperand(0)->getOperand(0);
// If the source type has twice the number of elements as our destination
// type, we know this is an extract of the high or low half of the vector.
EVT SVT = Source->getValueType(0);
if (SVT.getVectorNumElements() != VT.getVectorNumElements() * 2)
return SDValue();
DEBUG(dbgs() << "aarch64-lower: bitcast extract_subvector simplification\n");
// Create the simplified form to just extract the low or high half of the
// vector directly rather than bothering with the bitcasts.
SDLoc dl(N);
unsigned NumElements = VT.getVectorNumElements();
if (idx) {
SDValue HalfIdx = DAG.getConstant(NumElements, dl, MVT::i64);
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Source, HalfIdx);
} else {
SDValue SubReg = DAG.getTargetConstant(AArch64::dsub, dl, MVT::i32);
return SDValue(DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, dl, VT,
Source, SubReg),
0);
}
}
static SDValue performConcatVectorsCombine(SDNode *N,
TargetLowering::DAGCombinerInfo &DCI,
SelectionDAG &DAG) {
SDLoc dl(N);
EVT VT = N->getValueType(0);
SDValue N0 = N->getOperand(0), N1 = N->getOperand(1);
// Optimize concat_vectors of truncated vectors, where the intermediate
// type is illegal, to avoid said illegality, e.g.,
// (v4i16 (concat_vectors (v2i16 (truncate (v2i64))),
// (v2i16 (truncate (v2i64)))))
// ->
// (v4i16 (truncate (vector_shuffle (v4i32 (bitcast (v2i64))),
// (v4i32 (bitcast (v2i64))),
// <0, 2, 4, 6>)))
// This isn't really target-specific, but ISD::TRUNCATE legality isn't keyed
// on both input and result type, so we might generate worse code.
// On AArch64 we know it's fine for v2i64->v4i16 and v4i32->v8i8.
if (N->getNumOperands() == 2 &&
N0->getOpcode() == ISD::TRUNCATE &&
N1->getOpcode() == ISD::TRUNCATE) {
SDValue N00 = N0->getOperand(0);
SDValue N10 = N1->getOperand(0);
EVT N00VT = N00.getValueType();
if (N00VT == N10.getValueType() &&
(N00VT == MVT::v2i64 || N00VT == MVT::v4i32) &&
N00VT.getScalarSizeInBits() == 4 * VT.getScalarSizeInBits()) {
MVT MidVT = (N00VT == MVT::v2i64 ? MVT::v4i32 : MVT::v8i16);
SmallVector<int, 8> Mask(MidVT.getVectorNumElements());
for (size_t i = 0; i < Mask.size(); ++i)
Mask[i] = i * 2;
return DAG.getNode(ISD::TRUNCATE, dl, VT,
DAG.getVectorShuffle(
MidVT, dl,
DAG.getNode(ISD::BITCAST, dl, MidVT, N00),
DAG.getNode(ISD::BITCAST, dl, MidVT, N10), Mask));
}
}
// Wait 'til after everything is legalized to try this. That way we have
// legal vector types and such.
if (DCI.isBeforeLegalizeOps())
return SDValue();
// If we see a (concat_vectors (v1x64 A), (v1x64 A)) it's really a vector
// splat. The indexed instructions are going to be expecting a DUPLANE64, so
// canonicalise to that.
if (N0 == N1 && VT.getVectorNumElements() == 2) {
assert(VT.getScalarSizeInBits() == 64);
return DAG.getNode(AArch64ISD::DUPLANE64, dl, VT, WidenVector(N0, DAG),
DAG.getConstant(0, dl, MVT::i64));
}
// Canonicalise concat_vectors so that the right-hand vector has as few
// bit-casts as possible before its real operation. The primary matching
// destination for these operations will be the narrowing "2" instructions,
// which depend on the operation being performed on this right-hand vector.
// For example,
// (concat_vectors LHS, (v1i64 (bitconvert (v4i16 RHS))))
// becomes
// (bitconvert (concat_vectors (v4i16 (bitconvert LHS)), RHS))
if (N1->getOpcode() != ISD::BITCAST)
return SDValue();
SDValue RHS = N1->getOperand(0);
MVT RHSTy = RHS.getValueType().getSimpleVT();
// If the RHS is not a vector, this is not the pattern we're looking for.
if (!RHSTy.isVector())
return SDValue();
DEBUG(dbgs() << "aarch64-lower: concat_vectors bitcast simplification\n");
MVT ConcatTy = MVT::getVectorVT(RHSTy.getVectorElementType(),
RHSTy.getVectorNumElements() * 2);
return DAG.getNode(ISD::BITCAST, dl, VT,
DAG.getNode(ISD::CONCAT_VECTORS, dl, ConcatTy,
DAG.getNode(ISD::BITCAST, dl, RHSTy, N0),
RHS));
}
static SDValue tryCombineFixedPointConvert(SDNode *N,
TargetLowering::DAGCombinerInfo &DCI,
SelectionDAG &DAG) {
// Wait 'til after everything is legalized to try this. That way we have
// legal vector types and such.
if (DCI.isBeforeLegalizeOps())
return SDValue();
// Transform a scalar conversion of a value from a lane extract into a
// lane extract of a vector conversion. E.g., from foo1 to foo2:
// double foo1(int64x2_t a) { return vcvtd_n_f64_s64(a[1], 9); }
// double foo2(int64x2_t a) { return vcvtq_n_f64_s64(a, 9)[1]; }
//
// The second form interacts better with instruction selection and the
// register allocator to avoid cross-class register copies that aren't
// coalescable due to a lane reference.
// Check the operand and see if it originates from a lane extract.
SDValue Op1 = N->getOperand(1);
if (Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
// Yep, no additional predication needed. Perform the transform.
SDValue IID = N->getOperand(0);
SDValue Shift = N->getOperand(2);
SDValue Vec = Op1.getOperand(0);
SDValue Lane = Op1.getOperand(1);
EVT ResTy = N->getValueType(0);
EVT VecResTy;
SDLoc DL(N);
// The vector width should be 128 bits by the time we get here, even
// if it started as 64 bits (the extract_vector handling will have
// done so).
assert(Vec.getValueSizeInBits() == 128 &&
"unexpected vector size on extract_vector_elt!");
if (Vec.getValueType() == MVT::v4i32)
VecResTy = MVT::v4f32;
else if (Vec.getValueType() == MVT::v2i64)
VecResTy = MVT::v2f64;
else
llvm_unreachable("unexpected vector type!");
SDValue Convert =
DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VecResTy, IID, Vec, Shift);
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ResTy, Convert, Lane);
}
return SDValue();
}
// AArch64 high-vector "long" operations are formed by performing the non-high
// version on an extract_subvector of each operand which gets the high half:
//
// (longop2 LHS, RHS) == (longop (extract_high LHS), (extract_high RHS))
//
// However, there are cases which don't have an extract_high explicitly, but
// have another operation that can be made compatible with one for free. For
// example:
//
// (dupv64 scalar) --> (extract_high (dup128 scalar))
//
// This routine does the actual conversion of such DUPs, once outer routines
// have determined that everything else is in order.
// It also supports immediate DUP-like nodes (MOVI/MVNi), which we can fold
// similarly here.
static SDValue tryExtendDUPToExtractHigh(SDValue N, SelectionDAG &DAG) {
switch (N.getOpcode()) {
case AArch64ISD::DUP:
case AArch64ISD::DUPLANE8:
case AArch64ISD::DUPLANE16:
case AArch64ISD::DUPLANE32:
case AArch64ISD::DUPLANE64:
case AArch64ISD::MOVI:
case AArch64ISD::MOVIshift:
case AArch64ISD::MOVIedit:
case AArch64ISD::MOVImsl:
case AArch64ISD::MVNIshift:
case AArch64ISD::MVNImsl:
break;
default:
// FMOV could be supported, but isn't very useful, as it would only occur
// if you passed a bitcast' floating point immediate to an eligible long
// integer op (addl, smull, ...).
return SDValue();
}
MVT NarrowTy = N.getSimpleValueType();
if (!NarrowTy.is64BitVector())
return SDValue();
MVT ElementTy = NarrowTy.getVectorElementType();
unsigned NumElems = NarrowTy.getVectorNumElements();
MVT NewVT = MVT::getVectorVT(ElementTy, NumElems * 2);
SDLoc dl(N);
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, NarrowTy,
DAG.getNode(N->getOpcode(), dl, NewVT, N->ops()),
DAG.getConstant(NumElems, dl, MVT::i64));
}
static bool isEssentiallyExtractSubvector(SDValue N) {
if (N.getOpcode() == ISD::EXTRACT_SUBVECTOR)
return true;
return N.getOpcode() == ISD::BITCAST &&
N.getOperand(0).getOpcode() == ISD::EXTRACT_SUBVECTOR;
}
/// \brief Helper structure to keep track of ISD::SET_CC operands.
struct GenericSetCCInfo {
const SDValue *Opnd0;
const SDValue *Opnd1;
ISD::CondCode CC;
};
/// \brief Helper structure to keep track of a SET_CC lowered into AArch64 code.
struct AArch64SetCCInfo {
const SDValue *Cmp;
AArch64CC::CondCode CC;
};
/// \brief Helper structure to keep track of SetCC information.
union SetCCInfo {
GenericSetCCInfo Generic;
AArch64SetCCInfo AArch64;
};
/// \brief Helper structure to be able to read SetCC information. If set to
/// true, IsAArch64 field, Info is a AArch64SetCCInfo, otherwise Info is a
/// GenericSetCCInfo.
struct SetCCInfoAndKind {
SetCCInfo Info;
bool IsAArch64;
};
/// \brief Check whether or not \p Op is a SET_CC operation, either a generic or
/// an
/// AArch64 lowered one.
/// \p SetCCInfo is filled accordingly.
/// \post SetCCInfo is meanginfull only when this function returns true.
/// \return True when Op is a kind of SET_CC operation.
static bool isSetCC(SDValue Op, SetCCInfoAndKind &SetCCInfo) {
// If this is a setcc, this is straight forward.
if (Op.getOpcode() == ISD::SETCC) {
SetCCInfo.Info.Generic.Opnd0 = &Op.getOperand(0);
SetCCInfo.Info.Generic.Opnd1 = &Op.getOperand(1);
SetCCInfo.Info.Generic.CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
SetCCInfo.IsAArch64 = false;
return true;
}
// Otherwise, check if this is a matching csel instruction.
// In other words:
// - csel 1, 0, cc
// - csel 0, 1, !cc
if (Op.getOpcode() != AArch64ISD::CSEL)
return false;
// Set the information about the operands.
// TODO: we want the operands of the Cmp not the csel
SetCCInfo.Info.AArch64.Cmp = &Op.getOperand(3);
SetCCInfo.IsAArch64 = true;
SetCCInfo.Info.AArch64.CC = static_cast<AArch64CC::CondCode>(
cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue());
// Check that the operands matches the constraints:
// (1) Both operands must be constants.
// (2) One must be 1 and the other must be 0.
ConstantSDNode *TValue = dyn_cast<ConstantSDNode>(Op.getOperand(0));
ConstantSDNode *FValue = dyn_cast<ConstantSDNode>(Op.getOperand(1));
// Check (1).
if (!TValue || !FValue)
return false;
// Check (2).
if (!TValue->isOne()) {
// Update the comparison when we are interested in !cc.
std::swap(TValue, FValue);
SetCCInfo.Info.AArch64.CC =
AArch64CC::getInvertedCondCode(SetCCInfo.Info.AArch64.CC);
}
return TValue->isOne() && FValue->isNullValue();
}
// Returns true if Op is setcc or zext of setcc.
static bool isSetCCOrZExtSetCC(const SDValue& Op, SetCCInfoAndKind &Info) {
if (isSetCC(Op, Info))
return true;
return ((Op.getOpcode() == ISD::ZERO_EXTEND) &&
isSetCC(Op->getOperand(0), Info));
}
// The folding we want to perform is:
// (add x, [zext] (setcc cc ...) )
// -->
// (csel x, (add x, 1), !cc ...)
//
// The latter will get matched to a CSINC instruction.
static SDValue performSetccAddFolding(SDNode *Op, SelectionDAG &DAG) {
assert(Op && Op->getOpcode() == ISD::ADD && "Unexpected operation!");
SDValue LHS = Op->getOperand(0);
SDValue RHS = Op->getOperand(1);
SetCCInfoAndKind InfoAndKind;
// If neither operand is a SET_CC, give up.
if (!isSetCCOrZExtSetCC(LHS, InfoAndKind)) {
std::swap(LHS, RHS);
if (!isSetCCOrZExtSetCC(LHS, InfoAndKind))
return SDValue();
}
// FIXME: This could be generatized to work for FP comparisons.
EVT CmpVT = InfoAndKind.IsAArch64
? InfoAndKind.Info.AArch64.Cmp->getOperand(0).getValueType()
: InfoAndKind.Info.Generic.Opnd0->getValueType();
if (CmpVT != MVT::i32 && CmpVT != MVT::i64)
return SDValue();
SDValue CCVal;
SDValue Cmp;
SDLoc dl(Op);
if (InfoAndKind.IsAArch64) {
CCVal = DAG.getConstant(
AArch64CC::getInvertedCondCode(InfoAndKind.Info.AArch64.CC), dl,
MVT::i32);
Cmp = *InfoAndKind.Info.AArch64.Cmp;
} else
Cmp = getAArch64Cmp(*InfoAndKind.Info.Generic.Opnd0,
*InfoAndKind.Info.Generic.Opnd1,
ISD::getSetCCInverse(InfoAndKind.Info.Generic.CC, true),
CCVal, DAG, dl);
EVT VT = Op->getValueType(0);
LHS = DAG.getNode(ISD::ADD, dl, VT, RHS, DAG.getConstant(1, dl, VT));
return DAG.getNode(AArch64ISD::CSEL, dl, VT, RHS, LHS, CCVal, Cmp);
}
// The basic add/sub long vector instructions have variants with "2" on the end
// which act on the high-half of their inputs. They are normally matched by
// patterns like:
//
// (add (zeroext (extract_high LHS)),
// (zeroext (extract_high RHS)))
// -> uaddl2 vD, vN, vM
//
// However, if one of the extracts is something like a duplicate, this
// instruction can still be used profitably. This function puts the DAG into a
// more appropriate form for those patterns to trigger.
static SDValue performAddSubLongCombine(SDNode *N,
TargetLowering::DAGCombinerInfo &DCI,
SelectionDAG &DAG) {
if (DCI.isBeforeLegalizeOps())
return SDValue();
MVT VT = N->getSimpleValueType(0);
if (!VT.is128BitVector()) {
if (N->getOpcode() == ISD::ADD)
return performSetccAddFolding(N, DAG);
return SDValue();
}
// Make sure both branches are extended in the same way.
SDValue LHS = N->getOperand(0);
SDValue RHS = N->getOperand(1);
if ((LHS.getOpcode() != ISD::ZERO_EXTEND &&
LHS.getOpcode() != ISD::SIGN_EXTEND) ||
LHS.getOpcode() != RHS.getOpcode())
return SDValue();
unsigned ExtType = LHS.getOpcode();
// It's not worth doing if at least one of the inputs isn't already an
// extract, but we don't know which it'll be so we have to try both.
if (isEssentiallyExtractSubvector(LHS.getOperand(0))) {
RHS = tryExtendDUPToExtractHigh(RHS.getOperand(0), DAG);
if (!RHS.getNode())
return SDValue();
RHS = DAG.getNode(ExtType, SDLoc(N), VT, RHS);
} else if (isEssentiallyExtractSubvector(RHS.getOperand(0))) {
LHS = tryExtendDUPToExtractHigh(LHS.getOperand(0), DAG);
if (!LHS.getNode())
return SDValue();
LHS = DAG.getNode(ExtType, SDLoc(N), VT, LHS);
}
return DAG.getNode(N->getOpcode(), SDLoc(N), VT, LHS, RHS);
}
// Massage DAGs which we can use the high-half "long" operations on into
// something isel will recognize better. E.g.
//
// (aarch64_neon_umull (extract_high vec) (dupv64 scalar)) -->
// (aarch64_neon_umull (extract_high (v2i64 vec)))
// (extract_high (v2i64 (dup128 scalar)))))
//
static SDValue tryCombineLongOpWithDup(unsigned IID, SDNode *N,
TargetLowering::DAGCombinerInfo &DCI,
SelectionDAG &DAG) {
if (DCI.isBeforeLegalizeOps())
return SDValue();
SDValue LHS = N->getOperand(1);
SDValue RHS = N->getOperand(2);
assert(LHS.getValueType().is64BitVector() &&
RHS.getValueType().is64BitVector() &&
"unexpected shape for long operation");
// Either node could be a DUP, but it's not worth doing both of them (you'd
// just as well use the non-high version) so look for a corresponding extract
// operation on the other "wing".
if (isEssentiallyExtractSubvector(LHS)) {
RHS = tryExtendDUPToExtractHigh(RHS, DAG);
if (!RHS.getNode())
return SDValue();
} else if (isEssentiallyExtractSubvector(RHS)) {
LHS = tryExtendDUPToExtractHigh(LHS, DAG);
if (!LHS.getNode())
return SDValue();
}
return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SDLoc(N), N->getValueType(0),
N->getOperand(0), LHS, RHS);
}
static SDValue tryCombineShiftImm(unsigned IID, SDNode *N, SelectionDAG &DAG) {
MVT ElemTy = N->getSimpleValueType(0).getScalarType();
unsigned ElemBits = ElemTy.getSizeInBits();
int64_t ShiftAmount;
if (BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(N->getOperand(2))) {
APInt SplatValue, SplatUndef;
unsigned SplatBitSize;
bool HasAnyUndefs;
if (!BVN->isConstantSplat(SplatValue, SplatUndef, SplatBitSize,
HasAnyUndefs, ElemBits) ||
SplatBitSize != ElemBits)
return SDValue();
ShiftAmount = SplatValue.getSExtValue();
} else if (ConstantSDNode *CVN = dyn_cast<ConstantSDNode>(N->getOperand(2))) {
ShiftAmount = CVN->getSExtValue();
} else
return SDValue();
unsigned Opcode;
bool IsRightShift;
switch (IID) {
default:
llvm_unreachable("Unknown shift intrinsic");
case Intrinsic::aarch64_neon_sqshl:
Opcode = AArch64ISD::SQSHL_I;
IsRightShift = false;
break;
case Intrinsic::aarch64_neon_uqshl:
Opcode = AArch64ISD::UQSHL_I;
IsRightShift = false;
break;
case Intrinsic::aarch64_neon_srshl:
Opcode = AArch64ISD::SRSHR_I;
IsRightShift = true;
break;
case Intrinsic::aarch64_neon_urshl:
Opcode = AArch64ISD::URSHR_I;
IsRightShift = true;
break;
case Intrinsic::aarch64_neon_sqshlu:
Opcode = AArch64ISD::SQSHLU_I;
IsRightShift = false;
break;
}
if (IsRightShift && ShiftAmount <= -1 && ShiftAmount >= -(int)ElemBits) {
SDLoc dl(N);
return DAG.getNode(Opcode, dl, N->getValueType(0), N->getOperand(1),
DAG.getConstant(-ShiftAmount, dl, MVT::i32));
} else if (!IsRightShift && ShiftAmount >= 0 && ShiftAmount < ElemBits) {
SDLoc dl(N);
return DAG.getNode(Opcode, dl, N->getValueType(0), N->getOperand(1),
DAG.getConstant(ShiftAmount, dl, MVT::i32));
}
return SDValue();
}
// The CRC32[BH] instructions ignore the high bits of their data operand. Since
// the intrinsics must be legal and take an i32, this means there's almost
// certainly going to be a zext in the DAG which we can eliminate.
static SDValue tryCombineCRC32(unsigned Mask, SDNode *N, SelectionDAG &DAG) {
SDValue AndN = N->getOperand(2);
if (AndN.getOpcode() != ISD::AND)
return SDValue();
ConstantSDNode *CMask = dyn_cast<ConstantSDNode>(AndN.getOperand(1));
if (!CMask || CMask->getZExtValue() != Mask)
return SDValue();
return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SDLoc(N), MVT::i32,
N->getOperand(0), N->getOperand(1), AndN.getOperand(0));
}
static SDValue combineAcrossLanesIntrinsic(unsigned Opc, SDNode *N,
SelectionDAG &DAG) {
SDLoc dl(N);
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, N->getValueType(0),
DAG.getNode(Opc, dl,
N->getOperand(1).getSimpleValueType(),
N->getOperand(1)),
DAG.getConstant(0, dl, MVT::i64));
}
static SDValue performIntrinsicCombine(SDNode *N,
TargetLowering::DAGCombinerInfo &DCI,
const AArch64Subtarget *Subtarget) {
SelectionDAG &DAG = DCI.DAG;
unsigned IID = getIntrinsicID(N);
switch (IID) {
default:
break;
case Intrinsic::aarch64_neon_vcvtfxs2fp:
case Intrinsic::aarch64_neon_vcvtfxu2fp:
return tryCombineFixedPointConvert(N, DCI, DAG);
case Intrinsic::aarch64_neon_saddv:
return combineAcrossLanesIntrinsic(AArch64ISD::SADDV, N, DAG);
case Intrinsic::aarch64_neon_uaddv:
return combineAcrossLanesIntrinsic(AArch64ISD::UADDV, N, DAG);
case Intrinsic::aarch64_neon_sminv:
return combineAcrossLanesIntrinsic(AArch64ISD::SMINV, N, DAG);
case Intrinsic::aarch64_neon_uminv:
return combineAcrossLanesIntrinsic(AArch64ISD::UMINV, N, DAG);
case Intrinsic::aarch64_neon_smaxv:
return combineAcrossLanesIntrinsic(AArch64ISD::SMAXV, N, DAG);
case Intrinsic::aarch64_neon_umaxv:
return combineAcrossLanesIntrinsic(AArch64ISD::UMAXV, N, DAG);
case Intrinsic::aarch64_neon_fmax:
return DAG.getNode(ISD::FMAXNAN, SDLoc(N), N->getValueType(0),
N->getOperand(1), N->getOperand(2));
case Intrinsic::aarch64_neon_fmin:
return DAG.getNode(ISD::FMINNAN, SDLoc(N), N->getValueType(0),
N->getOperand(1), N->getOperand(2));
case Intrinsic::aarch64_neon_fmaxnm:
return DAG.getNode(ISD::FMAXNUM, SDLoc(N), N->getValueType(0),
N->getOperand(1), N->getOperand(2));
case Intrinsic::aarch64_neon_fminnm:
return DAG.getNode(ISD::FMINNUM, SDLoc(N), N->getValueType(0),
N->getOperand(1), N->getOperand(2));
case Intrinsic::aarch64_neon_smull:
case Intrinsic::aarch64_neon_umull:
case Intrinsic::aarch64_neon_pmull:
case Intrinsic::aarch64_neon_sqdmull:
return tryCombineLongOpWithDup(IID, N, DCI, DAG);
case Intrinsic::aarch64_neon_sqshl:
case Intrinsic::aarch64_neon_uqshl:
case Intrinsic::aarch64_neon_sqshlu:
case Intrinsic::aarch64_neon_srshl:
case Intrinsic::aarch64_neon_urshl:
return tryCombineShiftImm(IID, N, DAG);
case Intrinsic::aarch64_crc32b:
case Intrinsic::aarch64_crc32cb:
return tryCombineCRC32(0xff, N, DAG);
case Intrinsic::aarch64_crc32h:
case Intrinsic::aarch64_crc32ch:
return tryCombineCRC32(0xffff, N, DAG);
}
return SDValue();
}
static SDValue performExtendCombine(SDNode *N,
TargetLowering::DAGCombinerInfo &DCI,
SelectionDAG &DAG) {
// If we see something like (zext (sabd (extract_high ...), (DUP ...))) then
// we can convert that DUP into another extract_high (of a bigger DUP), which
// helps the backend to decide that an sabdl2 would be useful, saving a real
// extract_high operation.
if (!DCI.isBeforeLegalizeOps() && N->getOpcode() == ISD::ZERO_EXTEND &&
N->getOperand(0).getOpcode() == ISD::INTRINSIC_WO_CHAIN) {
SDNode *ABDNode = N->getOperand(0).getNode();
unsigned IID = getIntrinsicID(ABDNode);
if (IID == Intrinsic::aarch64_neon_sabd ||
IID == Intrinsic::aarch64_neon_uabd) {
SDValue NewABD = tryCombineLongOpWithDup(IID, ABDNode, DCI, DAG);
if (!NewABD.getNode())
return SDValue();
return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0),
NewABD);
}
}
// This is effectively a custom type legalization for AArch64.
//
// Type legalization will split an extend of a small, legal, type to a larger
// illegal type by first splitting the destination type, often creating
// illegal source types, which then get legalized in isel-confusing ways,
// leading to really terrible codegen. E.g.,
// %result = v8i32 sext v8i8 %value
// becomes
// %losrc = extract_subreg %value, ...
// %hisrc = extract_subreg %value, ...
// %lo = v4i32 sext v4i8 %losrc
// %hi = v4i32 sext v4i8 %hisrc
// Things go rapidly downhill from there.
//
// For AArch64, the [sz]ext vector instructions can only go up one element
// size, so we can, e.g., extend from i8 to i16, but to go from i8 to i32
// take two instructions.
//
// This implies that the most efficient way to do the extend from v8i8
// to two v4i32 values is to first extend the v8i8 to v8i16, then do
// the normal splitting to happen for the v8i16->v8i32.
// This is pre-legalization to catch some cases where the default
// type legalization will create ill-tempered code.
if (!DCI.isBeforeLegalizeOps())
return SDValue();
// We're only interested in cleaning things up for non-legal vector types
// here. If both the source and destination are legal, things will just
// work naturally without any fiddling.
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
EVT ResVT = N->getValueType(0);
if (!ResVT.isVector() || TLI.isTypeLegal(ResVT))
return SDValue();
// If the vector type isn't a simple VT, it's beyond the scope of what
// we're worried about here. Let legalization do its thing and hope for
// the best.
SDValue Src = N->getOperand(0);
EVT SrcVT = Src->getValueType(0);
if (!ResVT.isSimple() || !SrcVT.isSimple())
return SDValue();
// If the source VT is a 64-bit vector, we can play games and get the
// better results we want.
if (SrcVT.getSizeInBits() != 64)
return SDValue();
unsigned SrcEltSize = SrcVT.getScalarSizeInBits();
unsigned ElementCount = SrcVT.getVectorNumElements();
SrcVT = MVT::getVectorVT(MVT::getIntegerVT(SrcEltSize * 2), ElementCount);
SDLoc DL(N);
Src = DAG.getNode(N->getOpcode(), DL, SrcVT, Src);
// Now split the rest of the operation into two halves, each with a 64
// bit source.
EVT LoVT, HiVT;
SDValue Lo, Hi;
unsigned NumElements = ResVT.getVectorNumElements();
assert(!(NumElements & 1) && "Splitting vector, but not in half!");
LoVT = HiVT = EVT::getVectorVT(*DAG.getContext(),
ResVT.getVectorElementType(), NumElements / 2);
EVT InNVT = EVT::getVectorVT(*DAG.getContext(), SrcVT.getVectorElementType(),
LoVT.getVectorNumElements());
Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InNVT, Src,
DAG.getConstant(0, DL, MVT::i64));
Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InNVT, Src,
DAG.getConstant(InNVT.getVectorNumElements(), DL, MVT::i64));
Lo = DAG.getNode(N->getOpcode(), DL, LoVT, Lo);
Hi = DAG.getNode(N->getOpcode(), DL, HiVT, Hi);
// Now combine the parts back together so we still have a single result
// like the combiner expects.
return DAG.getNode(ISD::CONCAT_VECTORS, DL, ResVT, Lo, Hi);
}
static SDValue splitStoreSplat(SelectionDAG &DAG, StoreSDNode &St,
SDValue SplatVal, unsigned NumVecElts) {
unsigned OrigAlignment = St.getAlignment();
unsigned EltOffset = SplatVal.getValueType().getSizeInBits() / 8;
// Create scalar stores. This is at least as good as the code sequence for a
// split unaligned store which is a dup.s, ext.b, and two stores.
// Most of the time the three stores should be replaced by store pair
// instructions (stp).
SDLoc DL(&St);
SDValue BasePtr = St.getBasePtr();
+ const MachinePointerInfo &PtrInfo = St.getPointerInfo();
SDValue NewST1 =
- DAG.getStore(St.getChain(), DL, SplatVal, BasePtr, St.getPointerInfo(),
+ DAG.getStore(St.getChain(), DL, SplatVal, BasePtr, PtrInfo,
OrigAlignment, St.getMemOperand()->getFlags());
unsigned Offset = EltOffset;
while (--NumVecElts) {
unsigned Alignment = MinAlign(OrigAlignment, Offset);
SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr,
DAG.getConstant(Offset, DL, MVT::i64));
NewST1 = DAG.getStore(NewST1.getValue(0), DL, SplatVal, OffsetPtr,
- St.getPointerInfo(), Alignment,
+ PtrInfo.getWithOffset(Offset), Alignment,
St.getMemOperand()->getFlags());
Offset += EltOffset;
}
return NewST1;
}
/// Replace a splat of zeros to a vector store by scalar stores of WZR/XZR. The
/// load store optimizer pass will merge them to store pair stores. This should
/// be better than a movi to create the vector zero followed by a vector store
/// if the zero constant is not re-used, since one instructions and one register
/// live range will be removed.
///
/// For example, the final generated code should be:
///
/// stp xzr, xzr, [x0]
///
/// instead of:
///
/// movi v0.2d, #0
/// str q0, [x0]
///
static SDValue replaceZeroVectorStore(SelectionDAG &DAG, StoreSDNode &St) {
SDValue StVal = St.getValue();
EVT VT = StVal.getValueType();
// It is beneficial to scalarize a zero splat store for 2 or 3 i64 elements or
// 2, 3 or 4 i32 elements.
int NumVecElts = VT.getVectorNumElements();
if (!(((NumVecElts == 2 || NumVecElts == 3) &&
VT.getVectorElementType().getSizeInBits() == 64) ||
((NumVecElts == 2 || NumVecElts == 3 || NumVecElts == 4) &&
VT.getVectorElementType().getSizeInBits() == 32)))
return SDValue();
if (StVal.getOpcode() != ISD::BUILD_VECTOR)
return SDValue();
// If the zero constant has more than one use then the vector store could be
// better since the constant mov will be amortized and stp q instructions
// should be able to be formed.
if (!StVal.hasOneUse())
return SDValue();
// If the immediate offset of the address operand is too large for the stp
// instruction, then bail out.
if (DAG.isBaseWithConstantOffset(St.getBasePtr())) {
int64_t Offset = St.getBasePtr()->getConstantOperandVal(1);
if (Offset < -512 || Offset > 504)
return SDValue();
}
for (int I = 0; I < NumVecElts; ++I) {
SDValue EltVal = StVal.getOperand(I);
if (!isNullConstant(EltVal) && !isNullFPConstant(EltVal))
return SDValue();
}
// Use WZR/XZR here to prevent DAGCombiner::MergeConsecutiveStores from
// undoing this transformation.
SDValue SplatVal = VT.getVectorElementType().getSizeInBits() == 32
? DAG.getRegister(AArch64::WZR, MVT::i32)
: DAG.getRegister(AArch64::XZR, MVT::i64);
return splitStoreSplat(DAG, St, SplatVal, NumVecElts);
}
/// Replace a splat of a scalar to a vector store by scalar stores of the scalar
/// value. The load store optimizer pass will merge them to store pair stores.
/// This has better performance than a splat of the scalar followed by a split
/// vector store. Even if the stores are not merged it is four stores vs a dup,
/// followed by an ext.b and two stores.
static SDValue replaceSplatVectorStore(SelectionDAG &DAG, StoreSDNode &St) {
SDValue StVal = St.getValue();
EVT VT = StVal.getValueType();
// Don't replace floating point stores, they possibly won't be transformed to
// stp because of the store pair suppress pass.
if (VT.isFloatingPoint())
return SDValue();
// We can express a splat as store pair(s) for 2 or 4 elements.
unsigned NumVecElts = VT.getVectorNumElements();
if (NumVecElts != 4 && NumVecElts != 2)
return SDValue();
// Check that this is a splat.
// Make sure that each of the relevant vector element locations are inserted
// to, i.e. 0 and 1 for v2i64 and 0, 1, 2, 3 for v4i32.
std::bitset<4> IndexNotInserted((1 << NumVecElts) - 1);
SDValue SplatVal;
for (unsigned I = 0; I < NumVecElts; ++I) {
// Check for insert vector elements.
if (StVal.getOpcode() != ISD::INSERT_VECTOR_ELT)
return SDValue();
// Check that same value is inserted at each vector element.
if (I == 0)
SplatVal = StVal.getOperand(1);
else if (StVal.getOperand(1) != SplatVal)
return SDValue();
// Check insert element index.
ConstantSDNode *CIndex = dyn_cast<ConstantSDNode>(StVal.getOperand(2));
if (!CIndex)
return SDValue();
uint64_t IndexVal = CIndex->getZExtValue();
if (IndexVal >= NumVecElts)
return SDValue();
IndexNotInserted.reset(IndexVal);
StVal = StVal.getOperand(0);
}
// Check that all vector element locations were inserted to.
if (IndexNotInserted.any())
return SDValue();
return splitStoreSplat(DAG, St, SplatVal, NumVecElts);
}
static SDValue splitStores(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
SelectionDAG &DAG,
const AArch64Subtarget *Subtarget) {
if (!DCI.isBeforeLegalize())
return SDValue();
StoreSDNode *S = cast<StoreSDNode>(N);
if (S->isVolatile())
return SDValue();
SDValue StVal = S->getValue();
EVT VT = StVal.getValueType();
if (!VT.isVector())
return SDValue();
// If we get a splat of zeros, convert this vector store to a store of
// scalars. They will be merged into store pairs of xzr thereby removing one
// instruction and one register.
if (SDValue ReplacedZeroSplat = replaceZeroVectorStore(DAG, *S))
return ReplacedZeroSplat;
// FIXME: The logic for deciding if an unaligned store should be split should
// be included in TLI.allowsMisalignedMemoryAccesses(), and there should be
// a call to that function here.
if (!Subtarget->isMisaligned128StoreSlow())
return SDValue();
// Don't split at -Oz.
if (DAG.getMachineFunction().getFunction()->optForMinSize())
return SDValue();
// Don't split v2i64 vectors. Memcpy lowering produces those and splitting
// those up regresses performance on micro-benchmarks and olden/bh.
if (VT.getVectorNumElements() < 2 || VT == MVT::v2i64)
return SDValue();
// Split unaligned 16B stores. They are terrible for performance.
// Don't split stores with alignment of 1 or 2. Code that uses clang vector
// extensions can use this to mark that it does not want splitting to happen
// (by underspecifying alignment to be 1 or 2). Furthermore, the chance of
// eliminating alignment hazards is only 1 in 8 for alignment of 2.
if (VT.getSizeInBits() != 128 || S->getAlignment() >= 16 ||
S->getAlignment() <= 2)
return SDValue();
// If we get a splat of a scalar convert this vector store to a store of
// scalars. They will be merged into store pairs thereby removing two
// instructions.
if (SDValue ReplacedSplat = replaceSplatVectorStore(DAG, *S))
return ReplacedSplat;
SDLoc DL(S);
unsigned NumElts = VT.getVectorNumElements() / 2;
// Split VT into two.
EVT HalfVT =
EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), NumElts);
SDValue SubVector0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, StVal,
DAG.getConstant(0, DL, MVT::i64));
SDValue SubVector1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, StVal,
DAG.getConstant(NumElts, DL, MVT::i64));
SDValue BasePtr = S->getBasePtr();
SDValue NewST1 =
DAG.getStore(S->getChain(), DL, SubVector0, BasePtr, S->getPointerInfo(),
S->getAlignment(), S->getMemOperand()->getFlags());
SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr,
DAG.getConstant(8, DL, MVT::i64));
return DAG.getStore(NewST1.getValue(0), DL, SubVector1, OffsetPtr,
S->getPointerInfo(), S->getAlignment(),
S->getMemOperand()->getFlags());
}
/// Target-specific DAG combine function for post-increment LD1 (lane) and
/// post-increment LD1R.
static SDValue performPostLD1Combine(SDNode *N,
TargetLowering::DAGCombinerInfo &DCI,
bool IsLaneOp) {
if (DCI.isBeforeLegalizeOps())
return SDValue();
SelectionDAG &DAG = DCI.DAG;
EVT VT = N->getValueType(0);
unsigned LoadIdx = IsLaneOp ? 1 : 0;
SDNode *LD = N->getOperand(LoadIdx).getNode();
// If it is not LOAD, can not do such combine.
if (LD->getOpcode() != ISD::LOAD)
return SDValue();
LoadSDNode *LoadSDN = cast<LoadSDNode>(LD);
EVT MemVT = LoadSDN->getMemoryVT();
// Check if memory operand is the same type as the vector element.
if (MemVT != VT.getVectorElementType())
return SDValue();
// Check if there are other uses. If so, do not combine as it will introduce
// an extra load.
for (SDNode::use_iterator UI = LD->use_begin(), UE = LD->use_end(); UI != UE;
++UI) {
if (UI.getUse().getResNo() == 1) // Ignore uses of the chain result.
continue;
if (*UI != N)
return SDValue();
}
SDValue Addr = LD->getOperand(1);
SDValue Vector = N->getOperand(0);
// Search for a use of the address operand that is an increment.
for (SDNode::use_iterator UI = Addr.getNode()->use_begin(), UE =
Addr.getNode()->use_end(); UI != UE; ++UI) {
SDNode *User = *UI;
if (User->getOpcode() != ISD::ADD
|| UI.getUse().getResNo() != Addr.getResNo())
continue;
// Check that the add is independent of the load. Otherwise, folding it
// would create a cycle.
if (User->isPredecessorOf(LD) || LD->isPredecessorOf(User))
continue;
// Also check that add is not used in the vector operand. This would also
// create a cycle.
if (User->isPredecessorOf(Vector.getNode()))
continue;
// If the increment is a constant, it must match the memory ref size.
SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0);
if (ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode())) {
uint32_t IncVal = CInc->getZExtValue();
unsigned NumBytes = VT.getScalarSizeInBits() / 8;
if (IncVal != NumBytes)
continue;
Inc = DAG.getRegister(AArch64::XZR, MVT::i64);
}
// Finally, check that the vector doesn't depend on the load.
// Again, this would create a cycle.
// The load depending on the vector is fine, as that's the case for the
// LD1*post we'll eventually generate anyway.
if (LoadSDN->isPredecessorOf(Vector.getNode()))
continue;
SmallVector<SDValue, 8> Ops;
Ops.push_back(LD->getOperand(0)); // Chain
if (IsLaneOp) {
Ops.push_back(Vector); // The vector to be inserted
Ops.push_back(N->getOperand(2)); // The lane to be inserted in the vector
}
Ops.push_back(Addr);
Ops.push_back(Inc);
EVT Tys[3] = { VT, MVT::i64, MVT::Other };
SDVTList SDTys = DAG.getVTList(Tys);
unsigned NewOp = IsLaneOp ? AArch64ISD::LD1LANEpost : AArch64ISD::LD1DUPpost;
SDValue UpdN = DAG.getMemIntrinsicNode(NewOp, SDLoc(N), SDTys, Ops,
MemVT,
LoadSDN->getMemOperand());
// Update the uses.
SDValue NewResults[] = {
SDValue(LD, 0), // The result of load
SDValue(UpdN.getNode(), 2) // Chain
};
DCI.CombineTo(LD, NewResults);
DCI.CombineTo(N, SDValue(UpdN.getNode(), 0)); // Dup/Inserted Result
DCI.CombineTo(User, SDValue(UpdN.getNode(), 1)); // Write back register
break;
}
return SDValue();
}
/// Simplify \Addr given that the top byte of it is ignored by HW during
/// address translation.
static bool performTBISimplification(SDValue Addr,
TargetLowering::DAGCombinerInfo &DCI,
SelectionDAG &DAG) {
APInt DemandedMask = APInt::getLowBitsSet(64, 56);
APInt KnownZero, KnownOne;
TargetLowering::TargetLoweringOpt TLO(DAG, DCI.isBeforeLegalize(),
DCI.isBeforeLegalizeOps());
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
if (TLI.SimplifyDemandedBits(Addr, DemandedMask, KnownZero, KnownOne, TLO)) {
DCI.CommitTargetLoweringOpt(TLO);
return true;
}
return false;
}
static SDValue performSTORECombine(SDNode *N,
TargetLowering::DAGCombinerInfo &DCI,
SelectionDAG &DAG,
const AArch64Subtarget *Subtarget) {
if (SDValue Split = splitStores(N, DCI, DAG, Subtarget))
return Split;
if (Subtarget->supportsAddressTopByteIgnored() &&
performTBISimplification(N->getOperand(2), DCI, DAG))
return SDValue(N, 0);
return SDValue();
}
/// This function handles the log2-shuffle pattern produced by the
/// LoopVectorizer for the across vector reduction. It consists of
/// log2(NumVectorElements) steps and, in each step, 2^(s) elements
/// are reduced, where s is an induction variable from 0 to
/// log2(NumVectorElements).
static SDValue tryMatchAcrossLaneShuffleForReduction(SDNode *N, SDValue OpV,
unsigned Op,
SelectionDAG &DAG) {
EVT VTy = OpV->getOperand(0).getValueType();
if (!VTy.isVector())
return SDValue();
int NumVecElts = VTy.getVectorNumElements();
if (Op == ISD::FMAXNUM || Op == ISD::FMINNUM) {
if (NumVecElts != 4)
return SDValue();
} else {
if (NumVecElts != 4 && NumVecElts != 8 && NumVecElts != 16)
return SDValue();
}
int NumExpectedSteps = APInt(8, NumVecElts).logBase2();
SDValue PreOp = OpV;
// Iterate over each step of the across vector reduction.
for (int CurStep = 0; CurStep != NumExpectedSteps; ++CurStep) {
SDValue CurOp = PreOp.getOperand(0);
SDValue Shuffle = PreOp.getOperand(1);
if (Shuffle.getOpcode() != ISD::VECTOR_SHUFFLE) {
// Try to swap the 1st and 2nd operand as add and min/max instructions
// are commutative.
CurOp = PreOp.getOperand(1);
Shuffle = PreOp.getOperand(0);
if (Shuffle.getOpcode() != ISD::VECTOR_SHUFFLE)
return SDValue();
}
// Check if the input vector is fed by the operator we want to handle,
// except the last step; the very first input vector is not necessarily
// the same operator we are handling.
if (CurOp.getOpcode() != Op && (CurStep != (NumExpectedSteps - 1)))
return SDValue();
// Check if it forms one step of the across vector reduction.
// E.g.,
// %cur = add %1, %0
// %shuffle = vector_shuffle %cur, <2, 3, u, u>
// %pre = add %cur, %shuffle
if (Shuffle.getOperand(0) != CurOp)
return SDValue();
int NumMaskElts = 1 << CurStep;
ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Shuffle)->getMask();
// Check mask values in each step.
// We expect the shuffle mask in each step follows a specific pattern
// denoted here by the <M, U> form, where M is a sequence of integers
// starting from NumMaskElts, increasing by 1, and the number integers
// in M should be NumMaskElts. U is a sequence of UNDEFs and the number
// of undef in U should be NumVecElts - NumMaskElts.
// E.g., for <8 x i16>, mask values in each step should be :
// step 0 : <1,u,u,u,u,u,u,u>
// step 1 : <2,3,u,u,u,u,u,u>
// step 2 : <4,5,6,7,u,u,u,u>
for (int i = 0; i < NumVecElts; ++i)
if ((i < NumMaskElts && Mask[i] != (NumMaskElts + i)) ||
(i >= NumMaskElts && !(Mask[i] < 0)))
return SDValue();
PreOp = CurOp;
}
unsigned Opcode;
bool IsIntrinsic = false;
switch (Op) {
default:
llvm_unreachable("Unexpected operator for across vector reduction");
case ISD::ADD:
Opcode = AArch64ISD::UADDV;
break;
case ISD::SMAX:
Opcode = AArch64ISD::SMAXV;
break;
case ISD::UMAX:
Opcode = AArch64ISD::UMAXV;
break;
case ISD::SMIN:
Opcode = AArch64ISD::SMINV;
break;
case ISD::UMIN:
Opcode = AArch64ISD::UMINV;
break;
case ISD::FMAXNUM:
Opcode = Intrinsic::aarch64_neon_fmaxnmv;
IsIntrinsic = true;
break;
case ISD::FMINNUM:
Opcode = Intrinsic::aarch64_neon_fminnmv;
IsIntrinsic = true;
break;
}
SDLoc DL(N);
return IsIntrinsic
? DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, N->getValueType(0),
DAG.getConstant(Opcode, DL, MVT::i32), PreOp)
: DAG.getNode(
ISD::EXTRACT_VECTOR_ELT, DL, N->getValueType(0),
DAG.getNode(Opcode, DL, PreOp.getSimpleValueType(), PreOp),
DAG.getConstant(0, DL, MVT::i64));
}
/// Target-specific DAG combine for the across vector min/max reductions.
/// This function specifically handles the final clean-up step of the vector
/// min/max reductions produced by the LoopVectorizer. It is the log2-shuffle
/// pattern, which narrows down and finds the final min/max value from all
/// elements of the vector.
/// For example, for a <16 x i8> vector :
/// svn0 = vector_shuffle %0, undef<8,9,10,11,12,13,14,15,u,u,u,u,u,u,u,u>
/// %smax0 = smax %arr, svn0
/// %svn1 = vector_shuffle %smax0, undef<4,5,6,7,u,u,u,u,u,u,u,u,u,u,u,u>
/// %smax1 = smax %smax0, %svn1
/// %svn2 = vector_shuffle %smax1, undef<2,3,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
/// %smax2 = smax %smax1, svn2
/// %svn3 = vector_shuffle %smax2, undef<1,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
/// %sc = setcc %smax2, %svn3, gt
/// %n0 = extract_vector_elt %sc, #0
/// %n1 = extract_vector_elt %smax2, #0
/// %n2 = extract_vector_elt $smax2, #1
/// %result = select %n0, %n1, n2
/// becomes :
/// %1 = smaxv %0
/// %result = extract_vector_elt %1, 0
static SDValue
performAcrossLaneMinMaxReductionCombine(SDNode *N, SelectionDAG &DAG,
const AArch64Subtarget *Subtarget) {
if (!Subtarget->hasNEON())
return SDValue();
SDValue N0 = N->getOperand(0);
SDValue IfTrue = N->getOperand(1);
SDValue IfFalse = N->getOperand(2);
// Check if the SELECT merges up the final result of the min/max
// from a vector.
if (N0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
IfTrue.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
IfFalse.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
return SDValue();
// Expect N0 is fed by SETCC.
SDValue SetCC = N0.getOperand(0);
EVT SetCCVT = SetCC.getValueType();
if (SetCC.getOpcode() != ISD::SETCC || !SetCCVT.isVector() ||
SetCCVT.getVectorElementType() != MVT::i1)
return SDValue();
SDValue VectorOp = SetCC.getOperand(0);
unsigned Op = VectorOp->getOpcode();
// Check if the input vector is fed by the operator we want to handle.
if (Op != ISD::SMAX && Op != ISD::UMAX && Op != ISD::SMIN &&
Op != ISD::UMIN && Op != ISD::FMAXNUM && Op != ISD::FMINNUM)
return SDValue();
EVT VTy = VectorOp.getValueType();
if (!VTy.isVector())
return SDValue();
if (VTy.getSizeInBits() < 64)
return SDValue();
EVT EltTy = VTy.getVectorElementType();
if (Op == ISD::FMAXNUM || Op == ISD::FMINNUM) {
if (EltTy != MVT::f32)
return SDValue();
} else {
if (EltTy != MVT::i32 && EltTy != MVT::i16 && EltTy != MVT::i8)
return SDValue();
}
// Check if extracting from the same vector.
// For example,
// %sc = setcc %vector, %svn1, gt
// %n0 = extract_vector_elt %sc, #0
// %n1 = extract_vector_elt %vector, #0
// %n2 = extract_vector_elt $vector, #1
if (!(VectorOp == IfTrue->getOperand(0) &&
VectorOp == IfFalse->getOperand(0)))
return SDValue();
// Check if the condition code is matched with the operator type.
ISD::CondCode CC = cast<CondCodeSDNode>(SetCC->getOperand(2))->get();
if ((Op == ISD::SMAX && CC != ISD::SETGT && CC != ISD::SETGE) ||
(Op == ISD::UMAX && CC != ISD::SETUGT && CC != ISD::SETUGE) ||
(Op == ISD::SMIN && CC != ISD::SETLT && CC != ISD::SETLE) ||
(Op == ISD::UMIN && CC != ISD::SETULT && CC != ISD::SETULE) ||
(Op == ISD::FMAXNUM && CC != ISD::SETOGT && CC != ISD::SETOGE &&
CC != ISD::SETUGT && CC != ISD::SETUGE && CC != ISD::SETGT &&
CC != ISD::SETGE) ||
(Op == ISD::FMINNUM && CC != ISD::SETOLT && CC != ISD::SETOLE &&
CC != ISD::SETULT && CC != ISD::SETULE && CC != ISD::SETLT &&
CC != ISD::SETLE))
return SDValue();
// Expect to check only lane 0 from the vector SETCC.
if (!isNullConstant(N0.getOperand(1)))
return SDValue();
// Expect to extract the true value from lane 0.
if (!isNullConstant(IfTrue.getOperand(1)))
return SDValue();
// Expect to extract the false value from lane 1.
if (!isOneConstant(IfFalse.getOperand(1)))
return SDValue();
return tryMatchAcrossLaneShuffleForReduction(N, SetCC, Op, DAG);
}
/// Target-specific DAG combine for the across vector add reduction.
/// This function specifically handles the final clean-up step of the vector
/// add reduction produced by the LoopVectorizer. It is the log2-shuffle
/// pattern, which adds all elements of a vector together.
/// For example, for a <4 x i32> vector :
/// %1 = vector_shuffle %0, <2,3,u,u>
/// %2 = add %0, %1
/// %3 = vector_shuffle %2, <1,u,u,u>
/// %4 = add %2, %3
/// %result = extract_vector_elt %4, 0
/// becomes :
/// %0 = uaddv %0
/// %result = extract_vector_elt %0, 0
static SDValue
performAcrossLaneAddReductionCombine(SDNode *N, SelectionDAG &DAG,
const AArch64Subtarget *Subtarget) {
if (!Subtarget->hasNEON())
return SDValue();
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
// Check if the input vector is fed by the ADD.
if (N0->getOpcode() != ISD::ADD)
return SDValue();
// The vector extract idx must constant zero because we only expect the final
// result of the reduction is placed in lane 0.
if (!isNullConstant(N1))
return SDValue();
EVT VTy = N0.getValueType();
if (!VTy.isVector())
return SDValue();
EVT EltTy = VTy.getVectorElementType();
if (EltTy != MVT::i32 && EltTy != MVT::i16 && EltTy != MVT::i8)
return SDValue();
if (VTy.getSizeInBits() < 64)
return SDValue();
return tryMatchAcrossLaneShuffleForReduction(N, N0, ISD::ADD, DAG);
}
/// Target-specific DAG combine function for NEON load/store intrinsics
/// to merge base address updates.
static SDValue performNEONPostLDSTCombine(SDNode *N,
TargetLowering::DAGCombinerInfo &DCI,
SelectionDAG &DAG) {
if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
return SDValue();
unsigned AddrOpIdx = N->getNumOperands() - 1;
SDValue Addr = N->getOperand(AddrOpIdx);
// Search for a use of the address operand that is an increment.
for (SDNode::use_iterator UI = Addr.getNode()->use_begin(),
UE = Addr.getNode()->use_end(); UI != UE; ++UI) {
SDNode *User = *UI;
if (User->getOpcode() != ISD::ADD ||
UI.getUse().getResNo() != Addr.getResNo())
continue;
// Check that the add is independent of the load/store. Otherwise, folding
// it would create a cycle.
if (User->isPredecessorOf(N) || N->isPredecessorOf(User))
continue;
// Find the new opcode for the updating load/store.
bool IsStore = false;
bool IsLaneOp = false;
bool IsDupOp = false;
unsigned NewOpc = 0;
unsigned NumVecs = 0;
unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
switch (IntNo) {
default: llvm_unreachable("unexpected intrinsic for Neon base update");
case Intrinsic::aarch64_neon_ld2: NewOpc = AArch64ISD::LD2post;
NumVecs = 2; break;
case Intrinsic::aarch64_neon_ld3: NewOpc = AArch64ISD::LD3post;
NumVecs = 3; break;
case Intrinsic::aarch64_neon_ld4: NewOpc = AArch64ISD::LD4post;
NumVecs = 4; break;
case Intrinsic::aarch64_neon_st2: NewOpc = AArch64ISD::ST2post;
NumVecs = 2; IsStore = true; break;
case Intrinsic::aarch64_neon_st3: NewOpc = AArch64ISD::ST3post;
NumVecs = 3; IsStore = true; break;
case Intrinsic::aarch64_neon_st4: NewOpc = AArch64ISD::ST4post;
NumVecs = 4; IsStore = true; break;
case Intrinsic::aarch64_neon_ld1x2: NewOpc = AArch64ISD::LD1x2post;
NumVecs = 2; break;
case Intrinsic::aarch64_neon_ld1x3: NewOpc = AArch64ISD::LD1x3post;
NumVecs = 3; break;
case Intrinsic::aarch64_neon_ld1x4: NewOpc = AArch64ISD::LD1x4post;
NumVecs = 4; break;
case Intrinsic::aarch64_neon_st1x2: NewOpc = AArch64ISD::ST1x2post;
NumVecs = 2; IsStore = true; break;
case Intrinsic::aarch64_neon_st1x3: NewOpc = AArch64ISD::ST1x3post;
NumVecs = 3; IsStore = true; break;
case Intrinsic::aarch64_neon_st1x4: NewOpc = AArch64ISD::ST1x4post;
NumVecs = 4; IsStore = true; break;
case Intrinsic::aarch64_neon_ld2r: NewOpc = AArch64ISD::LD2DUPpost;
NumVecs = 2; IsDupOp = true; break;
case Intrinsic::aarch64_neon_ld3r: NewOpc = AArch64ISD::LD3DUPpost;
NumVecs = 3; IsDupOp = true; break;
case Intrinsic::aarch64_neon_ld4r: NewOpc = AArch64ISD::LD4DUPpost;
NumVecs = 4; IsDupOp = true; break;
case Intrinsic::aarch64_neon_ld2lane: NewOpc = AArch64ISD::LD2LANEpost;
NumVecs = 2; IsLaneOp = true; break;
case Intrinsic::aarch64_neon_ld3lane: NewOpc = AArch64ISD::LD3LANEpost;
NumVecs = 3; IsLaneOp = true; break;
case Intrinsic::aarch64_neon_ld4lane: NewOpc = AArch64ISD::LD4LANEpost;
NumVecs = 4; IsLaneOp = true; break;
case Intrinsic::aarch64_neon_st2lane: NewOpc = AArch64ISD::ST2LANEpost;
NumVecs = 2; IsStore = true; IsLaneOp = true; break;
case Intrinsic::aarch64_neon_st3lane: NewOpc = AArch64ISD::ST3LANEpost;
NumVecs = 3; IsStore = true; IsLaneOp = true; break;
case Intrinsic::aarch64_neon_st4lane: NewOpc = AArch64ISD::ST4LANEpost;
NumVecs = 4; IsStore = true; IsLaneOp = true; break;
}
EVT VecTy;
if (IsStore)
VecTy = N->getOperand(2).getValueType();
else
VecTy = N->getValueType(0);
// If the increment is a constant, it must match the memory ref size.
SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0);
if (ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode())) {
uint32_t IncVal = CInc->getZExtValue();
unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8;
if (IsLaneOp || IsDupOp)
NumBytes /= VecTy.getVectorNumElements();
if (IncVal != NumBytes)
continue;
Inc = DAG.getRegister(AArch64::XZR, MVT::i64);
}
SmallVector<SDValue, 8> Ops;
Ops.push_back(N->getOperand(0)); // Incoming chain
// Load lane and store have vector list as input.
if (IsLaneOp || IsStore)
for (unsigned i = 2; i < AddrOpIdx; ++i)
Ops.push_back(N->getOperand(i));
Ops.push_back(Addr); // Base register
Ops.push_back(Inc);
// Return Types.
EVT Tys[6];
unsigned NumResultVecs = (IsStore ? 0 : NumVecs);
unsigned n;
for (n = 0; n < NumResultVecs; ++n)
Tys[n] = VecTy;
Tys[n++] = MVT::i64; // Type of write back register
Tys[n] = MVT::Other; // Type of the chain
SDVTList SDTys = DAG.getVTList(makeArrayRef(Tys, NumResultVecs + 2));
MemIntrinsicSDNode *MemInt = cast<MemIntrinsicSDNode>(N);
SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, SDLoc(N), SDTys, Ops,
MemInt->getMemoryVT(),
MemInt->getMemOperand());
// Update the uses.
std::vector<SDValue> NewResults;
for (unsigned i = 0; i < NumResultVecs; ++i) {
NewResults.push_back(SDValue(UpdN.getNode(), i));
}
NewResults.push_back(SDValue(UpdN.getNode(), NumResultVecs + 1));
DCI.CombineTo(N, NewResults);
DCI.CombineTo(User, SDValue(UpdN.getNode(), NumResultVecs));
break;
}
return SDValue();
}
// Checks to see if the value is the prescribed width and returns information
// about its extension mode.
static
bool checkValueWidth(SDValue V, unsigned width, ISD::LoadExtType &ExtType) {
ExtType = ISD::NON_EXTLOAD;
switch(V.getNode()->getOpcode()) {
default:
return false;
case ISD::LOAD: {
LoadSDNode *LoadNode = cast<LoadSDNode>(V.getNode());
if ((LoadNode->getMemoryVT() == MVT::i8 && width == 8)
|| (LoadNode->getMemoryVT() == MVT::i16 && width == 16)) {
ExtType = LoadNode->getExtensionType();
return true;
}
return false;
}
case ISD::AssertSext: {
VTSDNode *TypeNode = cast<VTSDNode>(V.getNode()->getOperand(1));
if ((TypeNode->getVT() == MVT::i8 && width == 8)
|| (TypeNode->getVT() == MVT::i16 && width == 16)) {
ExtType = ISD::SEXTLOAD;
return true;
}
return false;
}
case ISD::AssertZext: {
VTSDNode *TypeNode = cast<VTSDNode>(V.getNode()->getOperand(1));
if ((TypeNode->getVT() == MVT::i8 && width == 8)
|| (TypeNode->getVT() == MVT::i16 && width == 16)) {
ExtType = ISD::ZEXTLOAD;
return true;
}
return false;
}
case ISD::Constant:
case ISD::TargetConstant: {
return std::abs(cast<ConstantSDNode>(V.getNode())->getSExtValue()) <
1LL << (width - 1);
}
}
return true;
}
// This function does a whole lot of voodoo to determine if the tests are
// equivalent without and with a mask. Essentially what happens is that given a
// DAG resembling:
//
// +-------------+ +-------------+ +-------------+ +-------------+
// | Input | | AddConstant | | CompConstant| | CC |
// +-------------+ +-------------+ +-------------+ +-------------+
// | | | |
// V V | +----------+
// +-------------+ +----+ | |
// | ADD | |0xff| | |
// +-------------+ +----+ | |
// | | | |
// V V | |
// +-------------+ | |
// | AND | | |
// +-------------+ | |
// | | |
// +-----+ | |
// | | |
// V V V
// +-------------+
// | CMP |
// +-------------+
//
// The AND node may be safely removed for some combinations of inputs. In
// particular we need to take into account the extension type of the Input,
// the exact values of AddConstant, CompConstant, and CC, along with the nominal
// width of the input (this can work for any width inputs, the above graph is
// specific to 8 bits.
//
// The specific equations were worked out by generating output tables for each
// AArch64CC value in terms of and AddConstant (w1), CompConstant(w2). The
// problem was simplified by working with 4 bit inputs, which means we only
// needed to reason about 24 distinct bit patterns: 8 patterns unique to zero
// extension (8,15), 8 patterns unique to sign extensions (-8,-1), and 8
// patterns present in both extensions (0,7). For every distinct set of
// AddConstant and CompConstants bit patterns we can consider the masked and
// unmasked versions to be equivalent if the result of this function is true for
// all 16 distinct bit patterns of for the current extension type of Input (w0).
//
// sub w8, w0, w1
// and w10, w8, #0x0f
// cmp w8, w2
// cset w9, AArch64CC
// cmp w10, w2
// cset w11, AArch64CC
// cmp w9, w11
// cset w0, eq
// ret
//
// Since the above function shows when the outputs are equivalent it defines
// when it is safe to remove the AND. Unfortunately it only runs on AArch64 and
// would be expensive to run during compiles. The equations below were written
// in a test harness that confirmed they gave equivalent outputs to the above
// for all inputs function, so they can be used determine if the removal is
// legal instead.
//
// isEquivalentMaskless() is the code for testing if the AND can be removed
// factored out of the DAG recognition as the DAG can take several forms.
static bool isEquivalentMaskless(unsigned CC, unsigned width,
ISD::LoadExtType ExtType, int AddConstant,
int CompConstant) {
// By being careful about our equations and only writing the in term
// symbolic values and well known constants (0, 1, -1, MaxUInt) we can
// make them generally applicable to all bit widths.
int MaxUInt = (1 << width);
// For the purposes of these comparisons sign extending the type is
// equivalent to zero extending the add and displacing it by half the integer
// width. Provided we are careful and make sure our equations are valid over
// the whole range we can just adjust the input and avoid writing equations
// for sign extended inputs.
if (ExtType == ISD::SEXTLOAD)
AddConstant -= (1 << (width-1));
switch(CC) {
case AArch64CC::LE:
case AArch64CC::GT:
if ((AddConstant == 0) ||
(CompConstant == MaxUInt - 1 && AddConstant < 0) ||
(AddConstant >= 0 && CompConstant < 0) ||
(AddConstant <= 0 && CompConstant <= 0 && CompConstant < AddConstant))
return true;
break;
case AArch64CC::LT:
case AArch64CC::GE:
if ((AddConstant == 0) ||
(AddConstant >= 0 && CompConstant <= 0) ||
(AddConstant <= 0 && CompConstant <= 0 && CompConstant <= AddConstant))
return true;
break;
case AArch64CC::HI:
case AArch64CC::LS:
if ((AddConstant >= 0 && CompConstant < 0) ||
(AddConstant <= 0 && CompConstant >= -1 &&
CompConstant < AddConstant + MaxUInt))
return true;
break;
case AArch64CC::PL:
case AArch64CC::MI:
if ((AddConstant == 0) ||
(AddConstant > 0 && CompConstant <= 0) ||
(AddConstant < 0 && CompConstant <= AddConstant))
return true;
break;
case AArch64CC::LO:
case AArch64CC::HS:
if ((AddConstant >= 0 && CompConstant <= 0) ||
(AddConstant <= 0 && CompConstant >= 0 &&
CompConstant <= AddConstant + MaxUInt))
return true;
break;
case AArch64CC::EQ:
case AArch64CC::NE:
if ((AddConstant > 0 && CompConstant < 0) ||
(AddConstant < 0 && CompConstant >= 0 &&
CompConstant < AddConstant + MaxUInt) ||
(AddConstant >= 0 && CompConstant >= 0 &&
CompConstant >= AddConstant) ||
(AddConstant <= 0 && CompConstant < 0 && CompConstant < AddConstant))
return true;
break;
case AArch64CC::VS:
case AArch64CC::VC:
case AArch64CC::AL:
case AArch64CC::NV:
return true;
case AArch64CC::Invalid:
break;
}
return false;
}
static
SDValue performCONDCombine(SDNode *N,
TargetLowering::DAGCombinerInfo &DCI,
SelectionDAG &DAG, unsigned CCIndex,
unsigned CmpIndex) {
unsigned CC = cast<ConstantSDNode>(N->getOperand(CCIndex))->getSExtValue();
SDNode *SubsNode = N->getOperand(CmpIndex).getNode();
unsigned CondOpcode = SubsNode->getOpcode();
if (CondOpcode != AArch64ISD::SUBS)
return SDValue();
// There is a SUBS feeding this condition. Is it fed by a mask we can
// use?
SDNode *AndNode = SubsNode->getOperand(0).getNode();
unsigned MaskBits = 0;
if (AndNode->getOpcode() != ISD::AND)
return SDValue();
if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(AndNode->getOperand(1))) {
uint32_t CNV = CN->getZExtValue();
if (CNV == 255)
MaskBits = 8;
else if (CNV == 65535)
MaskBits = 16;
}
if (!MaskBits)
return SDValue();
SDValue AddValue = AndNode->getOperand(0);
if (AddValue.getOpcode() != ISD::ADD)
return SDValue();
// The basic dag structure is correct, grab the inputs and validate them.
SDValue AddInputValue1 = AddValue.getNode()->getOperand(0);
SDValue AddInputValue2 = AddValue.getNode()->getOperand(1);
SDValue SubsInputValue = SubsNode->getOperand(1);
// The mask is present and the provenance of all the values is a smaller type,
// lets see if the mask is superfluous.
if (!isa<ConstantSDNode>(AddInputValue2.getNode()) ||
!isa<ConstantSDNode>(SubsInputValue.getNode()))
return SDValue();
ISD::LoadExtType ExtType;
if (!checkValueWidth(SubsInputValue, MaskBits, ExtType) ||
!checkValueWidth(AddInputValue2, MaskBits, ExtType) ||
!checkValueWidth(AddInputValue1, MaskBits, ExtType) )
return SDValue();
if(!isEquivalentMaskless(CC, MaskBits, ExtType,
cast<ConstantSDNode>(AddInputValue2.getNode())->getSExtValue(),
cast<ConstantSDNode>(SubsInputValue.getNode())->getSExtValue()))
return SDValue();
// The AND is not necessary, remove it.
SDVTList VTs = DAG.getVTList(SubsNode->getValueType(0),
SubsNode->getValueType(1));
SDValue Ops[] = { AddValue, SubsNode->getOperand(1) };
SDValue NewValue = DAG.getNode(CondOpcode, SDLoc(SubsNode), VTs, Ops);
DAG.ReplaceAllUsesWith(SubsNode, NewValue.getNode());
return SDValue(N, 0);
}
// Optimize compare with zero and branch.
static SDValue performBRCONDCombine(SDNode *N,
TargetLowering::DAGCombinerInfo &DCI,
SelectionDAG &DAG) {
if (SDValue NV = performCONDCombine(N, DCI, DAG, 2, 3))
N = NV.getNode();
SDValue Chain = N->getOperand(0);
SDValue Dest = N->getOperand(1);
SDValue CCVal = N->getOperand(2);
SDValue Cmp = N->getOperand(3);
assert(isa<ConstantSDNode>(CCVal) && "Expected a ConstantSDNode here!");
unsigned CC = cast<ConstantSDNode>(CCVal)->getZExtValue();
if (CC != AArch64CC::EQ && CC != AArch64CC::NE)
return SDValue();
unsigned CmpOpc = Cmp.getOpcode();
if (CmpOpc != AArch64ISD::ADDS && CmpOpc != AArch64ISD::SUBS)
return SDValue();
// Only attempt folding if there is only one use of the flag and no use of the
// value.
if (!Cmp->hasNUsesOfValue(0, 0) || !Cmp->hasNUsesOfValue(1, 1))
return SDValue();
SDValue LHS = Cmp.getOperand(0);
SDValue RHS = Cmp.getOperand(1);
assert(LHS.getValueType() == RHS.getValueType() &&
"Expected the value type to be the same for both operands!");
if (LHS.getValueType() != MVT::i32 && LHS.getValueType() != MVT::i64)
return SDValue();
if (isNullConstant(LHS))
std::swap(LHS, RHS);
if (!isNullConstant(RHS))
return SDValue();
if (LHS.getOpcode() == ISD::SHL || LHS.getOpcode() == ISD::SRA ||
LHS.getOpcode() == ISD::SRL)
return SDValue();
// Fold the compare into the branch instruction.
SDValue BR;
if (CC == AArch64CC::EQ)
BR = DAG.getNode(AArch64ISD::CBZ, SDLoc(N), MVT::Other, Chain, LHS, Dest);
else
BR = DAG.getNode(AArch64ISD::CBNZ, SDLoc(N), MVT::Other, Chain, LHS, Dest);
// Do not add new nodes to DAG combiner worklist.
DCI.CombineTo(N, BR, false);
return SDValue();
}
// Optimize some simple tbz/tbnz cases. Returns the new operand and bit to test
// as well as whether the test should be inverted. This code is required to
// catch these cases (as opposed to standard dag combines) because
// AArch64ISD::TBZ is matched during legalization.
static SDValue getTestBitOperand(SDValue Op, unsigned &Bit, bool &Invert,
SelectionDAG &DAG) {
if (!Op->hasOneUse())
return Op;
// We don't handle undef/constant-fold cases below, as they should have
// already been taken care of (e.g. and of 0, test of undefined shifted bits,
// etc.)
// (tbz (trunc x), b) -> (tbz x, b)
// This case is just here to enable more of the below cases to be caught.
if (Op->getOpcode() == ISD::TRUNCATE &&
Bit < Op->getValueType(0).getSizeInBits()) {
return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
}
if (Op->getNumOperands() != 2)
return Op;
auto *C = dyn_cast<ConstantSDNode>(Op->getOperand(1));
if (!C)
return Op;
switch (Op->getOpcode()) {
default:
return Op;
// (tbz (and x, m), b) -> (tbz x, b)
case ISD::AND:
if ((C->getZExtValue() >> Bit) & 1)
return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
return Op;
// (tbz (shl x, c), b) -> (tbz x, b-c)
case ISD::SHL:
if (C->getZExtValue() <= Bit &&
(Bit - C->getZExtValue()) < Op->getValueType(0).getSizeInBits()) {
Bit = Bit - C->getZExtValue();
return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
}
return Op;
// (tbz (sra x, c), b) -> (tbz x, b+c) or (tbz x, msb) if b+c is > # bits in x
case ISD::SRA:
Bit = Bit + C->getZExtValue();
if (Bit >= Op->getValueType(0).getSizeInBits())
Bit = Op->getValueType(0).getSizeInBits() - 1;
return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
// (tbz (srl x, c), b) -> (tbz x, b+c)
case ISD::SRL:
if ((Bit + C->getZExtValue()) < Op->getValueType(0).getSizeInBits()) {
Bit = Bit + C->getZExtValue();
return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
}
return Op;
// (tbz (xor x, -1), b) -> (tbnz x, b)
case ISD::XOR:
if ((C->getZExtValue() >> Bit) & 1)
Invert = !Invert;
return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
}
}
// Optimize test single bit zero/non-zero and branch.
static SDValue performTBZCombine(SDNode *N,
TargetLowering::DAGCombinerInfo &DCI,
SelectionDAG &DAG) {
unsigned Bit = cast<ConstantSDNode>(N->getOperand(2))->getZExtValue();
bool Invert = false;
SDValue TestSrc = N->getOperand(1);
SDValue NewTestSrc = getTestBitOperand(TestSrc, Bit, Invert, DAG);
if (TestSrc == NewTestSrc)
return SDValue();
unsigned NewOpc = N->getOpcode();
if (Invert) {
if (NewOpc == AArch64ISD::TBZ)
NewOpc = AArch64ISD::TBNZ;
else {
assert(NewOpc == AArch64ISD::TBNZ);
NewOpc = AArch64ISD::TBZ;
}
}
SDLoc DL(N);
return DAG.getNode(NewOpc, DL, MVT::Other, N->getOperand(0), NewTestSrc,
DAG.getConstant(Bit, DL, MVT::i64), N->getOperand(3));
}
// vselect (v1i1 setcc) ->
// vselect (v1iXX setcc) (XX is the size of the compared operand type)
// FIXME: Currently the type legalizer can't handle VSELECT having v1i1 as
// condition. If it can legalize "VSELECT v1i1" correctly, no need to combine
// such VSELECT.
static SDValue performVSelectCombine(SDNode *N, SelectionDAG &DAG) {
SDValue N0 = N->getOperand(0);
EVT CCVT = N0.getValueType();
if (N0.getOpcode() != ISD::SETCC || CCVT.getVectorNumElements() != 1 ||
CCVT.getVectorElementType() != MVT::i1)
return SDValue();
EVT ResVT = N->getValueType(0);
EVT CmpVT = N0.getOperand(0).getValueType();
// Only combine when the result type is of the same size as the compared
// operands.
if (ResVT.getSizeInBits() != CmpVT.getSizeInBits())
return SDValue();
SDValue IfTrue = N->getOperand(1);
SDValue IfFalse = N->getOperand(2);
SDValue SetCC =
DAG.getSetCC(SDLoc(N), CmpVT.changeVectorElementTypeToInteger(),
N0.getOperand(0), N0.getOperand(1),
cast<CondCodeSDNode>(N0.getOperand(2))->get());
return DAG.getNode(ISD::VSELECT, SDLoc(N), ResVT, SetCC,
IfTrue, IfFalse);
}
/// A vector select: "(select vL, vR, (setcc LHS, RHS))" is best performed with
/// the compare-mask instructions rather than going via NZCV, even if LHS and
/// RHS are really scalar. This replaces any scalar setcc in the above pattern
/// with a vector one followed by a DUP shuffle on the result.
static SDValue performSelectCombine(SDNode *N,
TargetLowering::DAGCombinerInfo &DCI) {
SelectionDAG &DAG = DCI.DAG;
SDValue N0 = N->getOperand(0);
EVT ResVT = N->getValueType(0);
if (N0.getOpcode() != ISD::SETCC)
return SDValue();
// Make sure the SETCC result is either i1 (initial DAG), or i32, the lowered
// scalar SetCCResultType. We also don't expect vectors, because we assume
// that selects fed by vector SETCCs are canonicalized to VSELECT.
assert((N0.getValueType() == MVT::i1 || N0.getValueType() == MVT::i32) &&
"Scalar-SETCC feeding SELECT has unexpected result type!");
// If NumMaskElts == 0, the comparison is larger than select result. The
// largest real NEON comparison is 64-bits per lane, which means the result is
// at most 32-bits and an illegal vector. Just bail out for now.
EVT SrcVT = N0.getOperand(0).getValueType();
// Don't try to do this optimization when the setcc itself has i1 operands.
// There are no legal vectors of i1, so this would be pointless.
if (SrcVT == MVT::i1)
return SDValue();
int NumMaskElts = ResVT.getSizeInBits() / SrcVT.getSizeInBits();
if (!ResVT.isVector() || NumMaskElts == 0)
return SDValue();
SrcVT = EVT::getVectorVT(*DAG.getContext(), SrcVT, NumMaskElts);
EVT CCVT = SrcVT.changeVectorElementTypeToInteger();
// Also bail out if the vector CCVT isn't the same size as ResVT.
// This can happen if the SETCC operand size doesn't divide the ResVT size
// (e.g., f64 vs v3f32).
if (CCVT.getSizeInBits() != ResVT.getSizeInBits())
return SDValue();
// Make sure we didn't create illegal types, if we're not supposed to.
assert(DCI.isBeforeLegalize() ||
DAG.getTargetLoweringInfo().isTypeLegal(SrcVT));
// First perform a vector comparison, where lane 0 is the one we're interested
// in.
SDLoc DL(N0);
SDValue LHS =
DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, SrcVT, N0.getOperand(0));
SDValue RHS =
DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, SrcVT, N0.getOperand(1));
SDValue SetCC = DAG.getNode(ISD::SETCC, DL, CCVT, LHS, RHS, N0.getOperand(2));
// Now duplicate the comparison mask we want across all other lanes.
SmallVector<int, 8> DUPMask(CCVT.getVectorNumElements(), 0);
SDValue Mask = DAG.getVectorShuffle(CCVT, DL, SetCC, SetCC, DUPMask);
Mask = DAG.getNode(ISD::BITCAST, DL,
ResVT.changeVectorElementTypeToInteger(), Mask);
return DAG.getSelect(DL, ResVT, Mask, N->getOperand(1), N->getOperand(2));
}
/// Get rid of unnecessary NVCASTs (that don't change the type).
static SDValue performNVCASTCombine(SDNode *N) {
if (N->getValueType(0) == N->getOperand(0).getValueType())
return N->getOperand(0);
return SDValue();
}
SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
SelectionDAG &DAG = DCI.DAG;
switch (N->getOpcode()) {
default:
break;
case ISD::ADD:
case ISD::SUB:
return performAddSubLongCombine(N, DCI, DAG);
case ISD::XOR:
return performXorCombine(N, DAG, DCI, Subtarget);
case ISD::MUL:
return performMulCombine(N, DAG, DCI, Subtarget);
case ISD::SINT_TO_FP:
case ISD::UINT_TO_FP:
return performIntToFpCombine(N, DAG, Subtarget);
case ISD::FP_TO_SINT:
case ISD::FP_TO_UINT:
return performFpToIntCombine(N, DAG, DCI, Subtarget);
case ISD::FDIV:
return performFDivCombine(N, DAG, DCI, Subtarget);
case ISD::OR:
return performORCombine(N, DCI, Subtarget);
case ISD::SRL:
return performSRLCombine(N, DCI);
case ISD::INTRINSIC_WO_CHAIN:
return performIntrinsicCombine(N, DCI, Subtarget);
case ISD::ANY_EXTEND:
case ISD::ZERO_EXTEND:
case ISD::SIGN_EXTEND:
return performExtendCombine(N, DCI, DAG);
case ISD::BITCAST:
return performBitcastCombine(N, DCI, DAG);
case ISD::CONCAT_VECTORS:
return performConcatVectorsCombine(N, DCI, DAG);
case ISD::SELECT: {
SDValue RV = performSelectCombine(N, DCI);
if (!RV.getNode())
RV = performAcrossLaneMinMaxReductionCombine(N, DAG, Subtarget);
return RV;
}
case ISD::VSELECT:
return performVSelectCombine(N, DCI.DAG);
case ISD::LOAD:
if (performTBISimplification(N->getOperand(1), DCI, DAG))
return SDValue(N, 0);
break;
case ISD::STORE:
return performSTORECombine(N, DCI, DAG, Subtarget);
case AArch64ISD::BRCOND:
return performBRCONDCombine(N, DCI, DAG);
case AArch64ISD::TBNZ:
case AArch64ISD::TBZ:
return performTBZCombine(N, DCI, DAG);
case AArch64ISD::CSEL:
return performCONDCombine(N, DCI, DAG, 2, 3);
case AArch64ISD::DUP:
return performPostLD1Combine(N, DCI, false);
case AArch64ISD::NVCAST:
return performNVCASTCombine(N);
case ISD::INSERT_VECTOR_ELT:
return performPostLD1Combine(N, DCI, true);
case ISD::EXTRACT_VECTOR_ELT:
return performAcrossLaneAddReductionCombine(N, DAG, Subtarget);
case ISD::INTRINSIC_VOID:
case ISD::INTRINSIC_W_CHAIN:
switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) {
case Intrinsic::aarch64_neon_ld2:
case Intrinsic::aarch64_neon_ld3:
case Intrinsic::aarch64_neon_ld4:
case Intrinsic::aarch64_neon_ld1x2:
case Intrinsic::aarch64_neon_ld1x3:
case Intrinsic::aarch64_neon_ld1x4:
case Intrinsic::aarch64_neon_ld2lane:
case Intrinsic::aarch64_neon_ld3lane:
case Intrinsic::aarch64_neon_ld4lane:
case Intrinsic::aarch64_neon_ld2r:
case Intrinsic::aarch64_neon_ld3r:
case Intrinsic::aarch64_neon_ld4r:
case Intrinsic::aarch64_neon_st2:
case Intrinsic::aarch64_neon_st3:
case Intrinsic::aarch64_neon_st4:
case Intrinsic::aarch64_neon_st1x2:
case Intrinsic::aarch64_neon_st1x3:
case Intrinsic::aarch64_neon_st1x4:
case Intrinsic::aarch64_neon_st2lane:
case Intrinsic::aarch64_neon_st3lane:
case Intrinsic::aarch64_neon_st4lane:
return performNEONPostLDSTCombine(N, DCI, DAG);
default:
break;
}
}
return SDValue();
}
// Check if the return value is used as only a return value, as otherwise
// we can't perform a tail-call. In particular, we need to check for
// target ISD nodes that are returns and any other "odd" constructs
// that the generic analysis code won't necessarily catch.
bool AArch64TargetLowering::isUsedByReturnOnly(SDNode *N,
SDValue &Chain) const {
if (N->getNumValues() != 1)
return false;
if (!N->hasNUsesOfValue(1, 0))
return false;
SDValue TCChain = Chain;
SDNode *Copy = *N->use_begin();
if (Copy->getOpcode() == ISD::CopyToReg) {
// If the copy has a glue operand, we conservatively assume it isn't safe to
// perform a tail call.
if (Copy->getOperand(Copy->getNumOperands() - 1).getValueType() ==
MVT::Glue)
return false;
TCChain = Copy->getOperand(0);
} else if (Copy->getOpcode() != ISD::FP_EXTEND)
return false;
bool HasRet = false;
for (SDNode *Node : Copy->uses()) {
if (Node->getOpcode() != AArch64ISD::RET_FLAG)
return false;
HasRet = true;
}
if (!HasRet)
return false;
Chain = TCChain;
return true;
}
// Return whether the an instruction can potentially be optimized to a tail
// call. This will cause the optimizers to attempt to move, or duplicate,
// return instructions to help enable tail call optimizations for this
// instruction.
bool AArch64TargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const {
return CI->isTailCall();
}
bool AArch64TargetLowering::getIndexedAddressParts(SDNode *Op, SDValue &Base,
SDValue &Offset,
ISD::MemIndexedMode &AM,
bool &IsInc,
SelectionDAG &DAG) const {
if (Op->getOpcode() != ISD::ADD && Op->getOpcode() != ISD::SUB)
return false;
Base = Op->getOperand(0);
// All of the indexed addressing mode instructions take a signed
// 9 bit immediate offset.
if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Op->getOperand(1))) {
int64_t RHSC = RHS->getSExtValue();
if (Op->getOpcode() == ISD::SUB)
RHSC = -(uint64_t)RHSC;
if (!isInt<9>(RHSC))
return false;
IsInc = (Op->getOpcode() == ISD::ADD);
Offset = Op->getOperand(1);
return true;
}
return false;
}
bool AArch64TargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base,
SDValue &Offset,
ISD::MemIndexedMode &AM,
SelectionDAG &DAG) const {
EVT VT;
SDValue Ptr;
if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
VT = LD->getMemoryVT();
Ptr = LD->getBasePtr();
} else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
VT = ST->getMemoryVT();
Ptr = ST->getBasePtr();
} else
return false;
bool IsInc;
if (!getIndexedAddressParts(Ptr.getNode(), Base, Offset, AM, IsInc, DAG))
return false;
AM = IsInc ? ISD::PRE_INC : ISD::PRE_DEC;
return true;
}
bool AArch64TargetLowering::getPostIndexedAddressParts(
SDNode *N, SDNode *Op, SDValue &Base, SDValue &Offset,
ISD::MemIndexedMode &AM, SelectionDAG &DAG) const {
EVT VT;
SDValue Ptr;
if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
VT = LD->getMemoryVT();
Ptr = LD->getBasePtr();
} else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
VT = ST->getMemoryVT();
Ptr = ST->getBasePtr();
} else
return false;
bool IsInc;
if (!getIndexedAddressParts(Op, Base, Offset, AM, IsInc, DAG))
return false;
// Post-indexing updates the base, so it's not a valid transform
// if that's not the same as the load's pointer.
if (Ptr != Base)
return false;
AM = IsInc ? ISD::POST_INC : ISD::POST_DEC;
return true;
}
static void ReplaceBITCASTResults(SDNode *N, SmallVectorImpl<SDValue> &Results,
SelectionDAG &DAG) {
SDLoc DL(N);
SDValue Op = N->getOperand(0);
if (N->getValueType(0) != MVT::i16 || Op.getValueType() != MVT::f16)
return;
Op = SDValue(
DAG.getMachineNode(TargetOpcode::INSERT_SUBREG, DL, MVT::f32,
DAG.getUNDEF(MVT::i32), Op,
DAG.getTargetConstant(AArch64::hsub, DL, MVT::i32)),
0);
Op = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Op);
Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Op));
}
static void ReplaceReductionResults(SDNode *N,
SmallVectorImpl<SDValue> &Results,
SelectionDAG &DAG, unsigned InterOp,
unsigned AcrossOp) {
EVT LoVT, HiVT;
SDValue Lo, Hi;
SDLoc dl(N);
std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);
SDValue InterVal = DAG.getNode(InterOp, dl, LoVT, Lo, Hi);
SDValue SplitVal = DAG.getNode(AcrossOp, dl, LoVT, InterVal);
Results.push_back(SplitVal);
}
static std::pair<SDValue, SDValue> splitInt128(SDValue N, SelectionDAG &DAG) {
SDLoc DL(N);
SDValue Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::i64, N);
SDValue Hi = DAG.getNode(ISD::TRUNCATE, DL, MVT::i64,
DAG.getNode(ISD::SRL, DL, MVT::i128, N,
DAG.getConstant(64, DL, MVT::i64)));
return std::make_pair(Lo, Hi);
}
static void ReplaceCMP_SWAP_128Results(SDNode *N,
SmallVectorImpl<SDValue> & Results,
SelectionDAG &DAG) {
assert(N->getValueType(0) == MVT::i128 &&
"AtomicCmpSwap on types less than 128 should be legal");
auto Desired = splitInt128(N->getOperand(2), DAG);
auto New = splitInt128(N->getOperand(3), DAG);
SDValue Ops[] = {N->getOperand(1), Desired.first, Desired.second,
New.first, New.second, N->getOperand(0)};
SDNode *CmpSwap = DAG.getMachineNode(
AArch64::CMP_SWAP_128, SDLoc(N),
DAG.getVTList(MVT::i64, MVT::i64, MVT::i32, MVT::Other), Ops);
MachineFunction &MF = DAG.getMachineFunction();
MachineSDNode::mmo_iterator MemOp = MF.allocateMemRefsArray(1);
MemOp[0] = cast<MemSDNode>(N)->getMemOperand();
cast<MachineSDNode>(CmpSwap)->setMemRefs(MemOp, MemOp + 1);
Results.push_back(SDValue(CmpSwap, 0));
Results.push_back(SDValue(CmpSwap, 1));
Results.push_back(SDValue(CmpSwap, 3));
}
void AArch64TargetLowering::ReplaceNodeResults(
SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const {
switch (N->getOpcode()) {
default:
llvm_unreachable("Don't know how to custom expand this");
case ISD::BITCAST:
ReplaceBITCASTResults(N, Results, DAG);
return;
case AArch64ISD::SADDV:
ReplaceReductionResults(N, Results, DAG, ISD::ADD, AArch64ISD::SADDV);
return;
case AArch64ISD::UADDV:
ReplaceReductionResults(N, Results, DAG, ISD::ADD, AArch64ISD::UADDV);
return;
case AArch64ISD::SMINV:
ReplaceReductionResults(N, Results, DAG, ISD::SMIN, AArch64ISD::SMINV);
return;
case AArch64ISD::UMINV:
ReplaceReductionResults(N, Results, DAG, ISD::UMIN, AArch64ISD::UMINV);
return;
case AArch64ISD::SMAXV:
ReplaceReductionResults(N, Results, DAG, ISD::SMAX, AArch64ISD::SMAXV);
return;
case AArch64ISD::UMAXV:
ReplaceReductionResults(N, Results, DAG, ISD::UMAX, AArch64ISD::UMAXV);
return;
case ISD::FP_TO_UINT:
case ISD::FP_TO_SINT:
assert(N->getValueType(0) == MVT::i128 && "unexpected illegal conversion");
// Let normal code take care of it by not adding anything to Results.
return;
case ISD::ATOMIC_CMP_SWAP:
ReplaceCMP_SWAP_128Results(N, Results, DAG);
return;
}
}
bool AArch64TargetLowering::useLoadStackGuardNode() const {
if (!Subtarget->isTargetAndroid())
return true;
return TargetLowering::useLoadStackGuardNode();
}
unsigned AArch64TargetLowering::combineRepeatedFPDivisors() const {
// Combine multiple FDIVs with the same divisor into multiple FMULs by the
// reciprocal if there are three or more FDIVs.
return 3;
}
TargetLoweringBase::LegalizeTypeAction
AArch64TargetLowering::getPreferredVectorAction(EVT VT) const {
MVT SVT = VT.getSimpleVT();
// During type legalization, we prefer to widen v1i8, v1i16, v1i32 to v8i8,
// v4i16, v2i32 instead of to promote.
if (SVT == MVT::v1i8 || SVT == MVT::v1i16 || SVT == MVT::v1i32
|| SVT == MVT::v1f32)
return TypeWidenVector;
return TargetLoweringBase::getPreferredVectorAction(VT);
}
// Loads and stores less than 128-bits are already atomic; ones above that
// are doomed anyway, so defer to the default libcall and blame the OS when
// things go wrong.
bool AArch64TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
unsigned Size = SI->getValueOperand()->getType()->getPrimitiveSizeInBits();
return Size == 128;
}
// Loads and stores less than 128-bits are already atomic; ones above that
// are doomed anyway, so defer to the default libcall and blame the OS when
// things go wrong.
TargetLowering::AtomicExpansionKind
AArch64TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
unsigned Size = LI->getType()->getPrimitiveSizeInBits();
return Size == 128 ? AtomicExpansionKind::LLSC : AtomicExpansionKind::None;
}
// For the real atomic operations, we have ldxr/stxr up to 128 bits,
TargetLowering::AtomicExpansionKind
AArch64TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
unsigned Size = AI->getType()->getPrimitiveSizeInBits();
return Size <= 128 ? AtomicExpansionKind::LLSC : AtomicExpansionKind::None;
}
bool AArch64TargetLowering::shouldExpandAtomicCmpXchgInIR(
AtomicCmpXchgInst *AI) const {
// At -O0, fast-regalloc cannot cope with the live vregs necessary to
// implement cmpxchg without spilling. If the address being exchanged is also
// on the stack and close enough to the spill slot, this can lead to a
// situation where the monitor always gets cleared and the atomic operation
// can never succeed. So at -O0 we need a late-expanded pseudo-inst instead.
return getTargetMachine().getOptLevel() != 0;
}
Value *AArch64TargetLowering::emitLoadLinked(IRBuilder<> &Builder, Value *Addr,
AtomicOrdering Ord) const {
Module *M = Builder.GetInsertBlock()->getParent()->getParent();
Type *ValTy = cast<PointerType>(Addr->getType())->getElementType();
bool IsAcquire = isAcquireOrStronger(Ord);
// Since i128 isn't legal and intrinsics don't get type-lowered, the ldrexd
// intrinsic must return {i64, i64} and we have to recombine them into a
// single i128 here.
if (ValTy->getPrimitiveSizeInBits() == 128) {
Intrinsic::ID Int =
IsAcquire ? Intrinsic::aarch64_ldaxp : Intrinsic::aarch64_ldxp;
Function *Ldxr = Intrinsic::getDeclaration(M, Int);
Addr = Builder.CreateBitCast(Addr, Type::getInt8PtrTy(M->getContext()));
Value *LoHi = Builder.CreateCall(Ldxr, Addr, "lohi");
Value *Lo = Builder.CreateExtractValue(LoHi, 0, "lo");
Value *Hi = Builder.CreateExtractValue(LoHi, 1, "hi");
Lo = Builder.CreateZExt(Lo, ValTy, "lo64");
Hi = Builder.CreateZExt(Hi, ValTy, "hi64");
return Builder.CreateOr(
Lo, Builder.CreateShl(Hi, ConstantInt::get(ValTy, 64)), "val64");
}
Type *Tys[] = { Addr->getType() };
Intrinsic::ID Int =
IsAcquire ? Intrinsic::aarch64_ldaxr : Intrinsic::aarch64_ldxr;
Function *Ldxr = Intrinsic::getDeclaration(M, Int, Tys);
return Builder.CreateTruncOrBitCast(
Builder.CreateCall(Ldxr, Addr),
cast<PointerType>(Addr->getType())->getElementType());
}
void AArch64TargetLowering::emitAtomicCmpXchgNoStoreLLBalance(
IRBuilder<> &Builder) const {
Module *M = Builder.GetInsertBlock()->getParent()->getParent();
Builder.CreateCall(Intrinsic::getDeclaration(M, Intrinsic::aarch64_clrex));
}
Value *AArch64TargetLowering::emitStoreConditional(IRBuilder<> &Builder,
Value *Val, Value *Addr,
AtomicOrdering Ord) const {
Module *M = Builder.GetInsertBlock()->getParent()->getParent();
bool IsRelease = isReleaseOrStronger(Ord);
// Since the intrinsics must have legal type, the i128 intrinsics take two
// parameters: "i64, i64". We must marshal Val into the appropriate form
// before the call.
if (Val->getType()->getPrimitiveSizeInBits() == 128) {
Intrinsic::ID Int =
IsRelease ? Intrinsic::aarch64_stlxp : Intrinsic::aarch64_stxp;
Function *Stxr = Intrinsic::getDeclaration(M, Int);
Type *Int64Ty = Type::getInt64Ty(M->getContext());
Value *Lo = Builder.CreateTrunc(Val, Int64Ty, "lo");
Value *Hi = Builder.CreateTrunc(Builder.CreateLShr(Val, 64), Int64Ty, "hi");
Addr = Builder.CreateBitCast(Addr, Type::getInt8PtrTy(M->getContext()));
return Builder.CreateCall(Stxr, {Lo, Hi, Addr});
}
Intrinsic::ID Int =
IsRelease ? Intrinsic::aarch64_stlxr : Intrinsic::aarch64_stxr;
Type *Tys[] = { Addr->getType() };
Function *Stxr = Intrinsic::getDeclaration(M, Int, Tys);
return Builder.CreateCall(Stxr,
{Builder.CreateZExtOrBitCast(
Val, Stxr->getFunctionType()->getParamType(0)),
Addr});
}
bool AArch64TargetLowering::functionArgumentNeedsConsecutiveRegisters(
Type *Ty, CallingConv::ID CallConv, bool isVarArg) const {
return Ty->isArrayTy();
}
bool AArch64TargetLowering::shouldNormalizeToSelectSequence(LLVMContext &,
EVT) const {
return false;
}
Value *AArch64TargetLowering::getIRStackGuard(IRBuilder<> &IRB) const {
if (!Subtarget->isTargetAndroid())
return TargetLowering::getIRStackGuard(IRB);
// Android provides a fixed TLS slot for the stack cookie. See the definition
// of TLS_SLOT_STACK_GUARD in
// https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h
const unsigned TlsOffset = 0x28;
Module *M = IRB.GetInsertBlock()->getParent()->getParent();
Function *ThreadPointerFunc =
Intrinsic::getDeclaration(M, Intrinsic::thread_pointer);
return IRB.CreatePointerCast(
IRB.CreateConstGEP1_32(IRB.CreateCall(ThreadPointerFunc), TlsOffset),
Type::getInt8PtrTy(IRB.getContext())->getPointerTo(0));
}
Value *AArch64TargetLowering::getSafeStackPointerLocation(IRBuilder<> &IRB) const {
if (!Subtarget->isTargetAndroid())
return TargetLowering::getSafeStackPointerLocation(IRB);
// Android provides a fixed TLS slot for the SafeStack pointer. See the
// definition of TLS_SLOT_SAFESTACK in
// https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h
const unsigned TlsOffset = 0x48;
Module *M = IRB.GetInsertBlock()->getParent()->getParent();
Function *ThreadPointerFunc =
Intrinsic::getDeclaration(M, Intrinsic::thread_pointer);
return IRB.CreatePointerCast(
IRB.CreateConstGEP1_32(IRB.CreateCall(ThreadPointerFunc), TlsOffset),
Type::getInt8PtrTy(IRB.getContext())->getPointerTo(0));
}
void AArch64TargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
// Update IsSplitCSR in AArch64unctionInfo.
AArch64FunctionInfo *AFI = Entry->getParent()->getInfo<AArch64FunctionInfo>();
AFI->setIsSplitCSR(true);
}
void AArch64TargetLowering::insertCopiesSplitCSR(
MachineBasicBlock *Entry,
const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
if (!IStart)
return;
const TargetInstrInfo *TII = Subtarget->getInstrInfo();
MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
MachineBasicBlock::iterator MBBI = Entry->begin();
for (const MCPhysReg *I = IStart; *I; ++I) {
const TargetRegisterClass *RC = nullptr;
if (AArch64::GPR64RegClass.contains(*I))
RC = &AArch64::GPR64RegClass;
else if (AArch64::FPR64RegClass.contains(*I))
RC = &AArch64::FPR64RegClass;
else
llvm_unreachable("Unexpected register class in CSRsViaCopy!");
unsigned NewVR = MRI->createVirtualRegister(RC);
// Create copy from CSR to a virtual register.
// FIXME: this currently does not emit CFI pseudo-instructions, it works
// fine for CXX_FAST_TLS since the C++-style TLS access functions should be
// nounwind. If we want to generalize this later, we may need to emit
// CFI pseudo-instructions.
assert(Entry->getParent()->getFunction()->hasFnAttribute(
Attribute::NoUnwind) &&
"Function should be nounwind in insertCopiesSplitCSR!");
Entry->addLiveIn(*I);
BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
.addReg(*I);
// Insert the copy-back instructions right before the terminator.
for (auto *Exit : Exits)
BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
TII->get(TargetOpcode::COPY), *I)
.addReg(NewVR);
}
}
bool AArch64TargetLowering::isIntDivCheap(EVT VT, AttributeSet Attr) const {
// Integer division on AArch64 is expensive. However, when aggressively
// optimizing for code size, we prefer to use a div instruction, as it is
// usually smaller than the alternative sequence.
// The exception to this is vector division. Since AArch64 doesn't have vector
// integer division, leaving the division as-is is a loss even in terms of
// size, because it will have to be scalarized, while the alternative code
// sequence can be performed in vector form.
bool OptSize =
Attr.hasAttribute(AttributeSet::FunctionIndex, Attribute::MinSize);
return OptSize && !VT.isVector();
}
Index: projects/clang400-import/contrib/llvm/lib/Target/X86/X86ExpandPseudo.cpp
===================================================================
--- projects/clang400-import/contrib/llvm/lib/Target/X86/X86ExpandPseudo.cpp (revision 313642)
+++ projects/clang400-import/contrib/llvm/lib/Target/X86/X86ExpandPseudo.cpp (revision 313643)
@@ -1,297 +1,278 @@
//===------- X86ExpandPseudo.cpp - Expand pseudo instructions -------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file contains a pass that expands pseudo instructions into target
// instructions to allow proper scheduling, if-conversion, other late
// optimizations, or simply the encoding of the instructions.
//
//===----------------------------------------------------------------------===//
#include "X86.h"
#include "X86FrameLowering.h"
#include "X86InstrBuilder.h"
#include "X86InstrInfo.h"
#include "X86MachineFunctionInfo.h"
#include "X86Subtarget.h"
#include "llvm/Analysis/EHPersonalities.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/Passes.h" // For IDs of passes that are preserved.
#include "llvm/IR/GlobalValue.h"
using namespace llvm;
#define DEBUG_TYPE "x86-pseudo"
namespace {
class X86ExpandPseudo : public MachineFunctionPass {
public:
static char ID;
X86ExpandPseudo() : MachineFunctionPass(ID) {}
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.setPreservesCFG();
AU.addPreservedID(MachineLoopInfoID);
AU.addPreservedID(MachineDominatorsID);
MachineFunctionPass::getAnalysisUsage(AU);
}
const X86Subtarget *STI;
const X86InstrInfo *TII;
const X86RegisterInfo *TRI;
const X86MachineFunctionInfo *X86FI;
const X86FrameLowering *X86FL;
bool runOnMachineFunction(MachineFunction &Fn) override;
MachineFunctionProperties getRequiredProperties() const override {
return MachineFunctionProperties().set(
MachineFunctionProperties::Property::NoVRegs);
}
StringRef getPassName() const override {
return "X86 pseudo instruction expansion pass";
}
private:
bool ExpandMI(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI);
bool ExpandMBB(MachineBasicBlock &MBB);
};
char X86ExpandPseudo::ID = 0;
} // End anonymous namespace.
/// If \p MBBI is a pseudo instruction, this method expands
/// it to the corresponding (sequence of) actual instruction(s).
/// \returns true if \p MBBI has been expanded.
bool X86ExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI) {
MachineInstr &MI = *MBBI;
unsigned Opcode = MI.getOpcode();
DebugLoc DL = MBBI->getDebugLoc();
switch (Opcode) {
default:
return false;
case X86::TCRETURNdi:
- case X86::TCRETURNdicc:
case X86::TCRETURNri:
case X86::TCRETURNmi:
case X86::TCRETURNdi64:
- case X86::TCRETURNdi64cc:
case X86::TCRETURNri64:
case X86::TCRETURNmi64: {
bool isMem = Opcode == X86::TCRETURNmi || Opcode == X86::TCRETURNmi64;
MachineOperand &JumpTarget = MBBI->getOperand(0);
MachineOperand &StackAdjust = MBBI->getOperand(isMem ? 5 : 1);
assert(StackAdjust.isImm() && "Expecting immediate value.");
// Adjust stack pointer.
int StackAdj = StackAdjust.getImm();
int MaxTCDelta = X86FI->getTCReturnAddrDelta();
int Offset = 0;
assert(MaxTCDelta <= 0 && "MaxTCDelta should never be positive");
// Incoporate the retaddr area.
Offset = StackAdj - MaxTCDelta;
assert(Offset >= 0 && "Offset should never be negative");
- if (Opcode == X86::TCRETURNdicc || Opcode == X86::TCRETURNdi64cc) {
- assert(Offset == 0 && "Conditional tail call cannot adjust the stack.");
- }
-
if (Offset) {
// Check for possible merge with preceding ADD instruction.
Offset += X86FL->mergeSPUpdates(MBB, MBBI, true);
X86FL->emitSPUpdate(MBB, MBBI, Offset, /*InEpilogue=*/true);
}
// Jump to label or value in register.
bool IsWin64 = STI->isTargetWin64();
- if (Opcode == X86::TCRETURNdi || Opcode == X86::TCRETURNdicc ||
- Opcode == X86::TCRETURNdi64 || Opcode == X86::TCRETURNdi64cc) {
+ if (Opcode == X86::TCRETURNdi || Opcode == X86::TCRETURNdi64) {
unsigned Op;
switch (Opcode) {
case X86::TCRETURNdi:
Op = X86::TAILJMPd;
break;
- case X86::TCRETURNdicc:
- Op = X86::TAILJMPd_CC;
- break;
- case X86::TCRETURNdi64cc:
- assert(!IsWin64 && "Conditional tail calls confuse the Win64 unwinder.");
- // TODO: We could do it for Win64 "leaf" functions though; PR30337.
- Op = X86::TAILJMPd64_CC;
- break;
default:
// Note: Win64 uses REX prefixes indirect jumps out of functions, but
// not direct ones.
Op = X86::TAILJMPd64;
break;
}
MachineInstrBuilder MIB = BuildMI(MBB, MBBI, DL, TII->get(Op));
if (JumpTarget.isGlobal()) {
MIB.addGlobalAddress(JumpTarget.getGlobal(), JumpTarget.getOffset(),
JumpTarget.getTargetFlags());
} else {
assert(JumpTarget.isSymbol());
MIB.addExternalSymbol(JumpTarget.getSymbolName(),
JumpTarget.getTargetFlags());
}
- if (Op == X86::TAILJMPd_CC || Op == X86::TAILJMPd64_CC) {
- MIB.addImm(MBBI->getOperand(2).getImm());
- }
-
} else if (Opcode == X86::TCRETURNmi || Opcode == X86::TCRETURNmi64) {
unsigned Op = (Opcode == X86::TCRETURNmi)
? X86::TAILJMPm
: (IsWin64 ? X86::TAILJMPm64_REX : X86::TAILJMPm64);
MachineInstrBuilder MIB = BuildMI(MBB, MBBI, DL, TII->get(Op));
for (unsigned i = 0; i != 5; ++i)
MIB.addOperand(MBBI->getOperand(i));
} else if (Opcode == X86::TCRETURNri64) {
BuildMI(MBB, MBBI, DL,
TII->get(IsWin64 ? X86::TAILJMPr64_REX : X86::TAILJMPr64))
.addReg(JumpTarget.getReg(), RegState::Kill);
} else {
BuildMI(MBB, MBBI, DL, TII->get(X86::TAILJMPr))
.addReg(JumpTarget.getReg(), RegState::Kill);
}
MachineInstr &NewMI = *std::prev(MBBI);
NewMI.copyImplicitOps(*MBBI->getParent()->getParent(), *MBBI);
// Delete the pseudo instruction TCRETURN.
MBB.erase(MBBI);
return true;
}
case X86::EH_RETURN:
case X86::EH_RETURN64: {
MachineOperand &DestAddr = MBBI->getOperand(0);
assert(DestAddr.isReg() && "Offset should be in register!");
const bool Uses64BitFramePtr =
STI->isTarget64BitLP64() || STI->isTargetNaCl64();
unsigned StackPtr = TRI->getStackRegister();
BuildMI(MBB, MBBI, DL,
TII->get(Uses64BitFramePtr ? X86::MOV64rr : X86::MOV32rr), StackPtr)
.addReg(DestAddr.getReg());
// The EH_RETURN pseudo is really removed during the MC Lowering.
return true;
}
case X86::IRET: {
// Adjust stack to erase error code
int64_t StackAdj = MBBI->getOperand(0).getImm();
X86FL->emitSPUpdate(MBB, MBBI, StackAdj, true);
// Replace pseudo with machine iret
BuildMI(MBB, MBBI, DL,
TII->get(STI->is64Bit() ? X86::IRET64 : X86::IRET32));
MBB.erase(MBBI);
return true;
}
case X86::RET: {
// Adjust stack to erase error code
int64_t StackAdj = MBBI->getOperand(0).getImm();
MachineInstrBuilder MIB;
if (StackAdj == 0) {
MIB = BuildMI(MBB, MBBI, DL,
TII->get(STI->is64Bit() ? X86::RETQ : X86::RETL));
} else if (isUInt<16>(StackAdj)) {
MIB = BuildMI(MBB, MBBI, DL,
TII->get(STI->is64Bit() ? X86::RETIQ : X86::RETIL))
.addImm(StackAdj);
} else {
assert(!STI->is64Bit() &&
"shouldn't need to do this for x86_64 targets!");
// A ret can only handle immediates as big as 2**16-1. If we need to pop
// off bytes before the return address, we must do it manually.
BuildMI(MBB, MBBI, DL, TII->get(X86::POP32r)).addReg(X86::ECX, RegState::Define);
X86FL->emitSPUpdate(MBB, MBBI, StackAdj, /*InEpilogue=*/true);
BuildMI(MBB, MBBI, DL, TII->get(X86::PUSH32r)).addReg(X86::ECX);
MIB = BuildMI(MBB, MBBI, DL, TII->get(X86::RETL));
}
for (unsigned I = 1, E = MBBI->getNumOperands(); I != E; ++I)
MIB.addOperand(MBBI->getOperand(I));
MBB.erase(MBBI);
return true;
}
case X86::EH_RESTORE: {
// Restore ESP and EBP, and optionally ESI if required.
bool IsSEH = isAsynchronousEHPersonality(classifyEHPersonality(
MBB.getParent()->getFunction()->getPersonalityFn()));
X86FL->restoreWin32EHStackPointers(MBB, MBBI, DL, /*RestoreSP=*/IsSEH);
MBBI->eraseFromParent();
return true;
}
case X86::LCMPXCHG8B_SAVE_EBX:
case X86::LCMPXCHG16B_SAVE_RBX: {
// Perform the following transformation.
// SaveRbx = pseudocmpxchg Addr, <4 opds for the address>, InArg, SaveRbx
// =>
// [E|R]BX = InArg
// actualcmpxchg Addr
// [E|R]BX = SaveRbx
const MachineOperand &InArg = MBBI->getOperand(6);
unsigned SaveRbx = MBBI->getOperand(7).getReg();
unsigned ActualInArg =
Opcode == X86::LCMPXCHG8B_SAVE_EBX ? X86::EBX : X86::RBX;
// Copy the input argument of the pseudo into the argument of the
// actual instruction.
TII->copyPhysReg(MBB, MBBI, DL, ActualInArg, InArg.getReg(),
InArg.isKill());
// Create the actual instruction.
unsigned ActualOpc =
Opcode == X86::LCMPXCHG8B_SAVE_EBX ? X86::LCMPXCHG8B : X86::LCMPXCHG16B;
MachineInstr *NewInstr = BuildMI(MBB, MBBI, DL, TII->get(ActualOpc));
// Copy the operands related to the address.
for (unsigned Idx = 1; Idx < 6; ++Idx)
NewInstr->addOperand(MBBI->getOperand(Idx));
// Finally, restore the value of RBX.
TII->copyPhysReg(MBB, MBBI, DL, ActualInArg, SaveRbx,
/*SrcIsKill*/ true);
// Delete the pseudo.
MBBI->eraseFromParent();
return true;
}
}
llvm_unreachable("Previous switch has a fallthrough?");
}
/// Expand all pseudo instructions contained in \p MBB.
/// \returns true if any expansion occurred for \p MBB.
bool X86ExpandPseudo::ExpandMBB(MachineBasicBlock &MBB) {
bool Modified = false;
// MBBI may be invalidated by the expansion.
MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
while (MBBI != E) {
MachineBasicBlock::iterator NMBBI = std::next(MBBI);
Modified |= ExpandMI(MBB, MBBI);
MBBI = NMBBI;
}
return Modified;
}
bool X86ExpandPseudo::runOnMachineFunction(MachineFunction &MF) {
STI = &static_cast<const X86Subtarget &>(MF.getSubtarget());
TII = STI->getInstrInfo();
TRI = STI->getRegisterInfo();
X86FI = MF.getInfo<X86MachineFunctionInfo>();
X86FL = STI->getFrameLowering();
bool Modified = false;
for (MachineBasicBlock &MBB : MF)
Modified |= ExpandMBB(MBB);
return Modified;
}
/// Returns an instance of the pseudo instruction expansion pass.
FunctionPass *llvm::createX86ExpandPseudoPass() {
return new X86ExpandPseudo();
}
Index: projects/clang400-import/contrib/llvm/lib/Target/X86/X86InstrControl.td
===================================================================
--- projects/clang400-import/contrib/llvm/lib/Target/X86/X86InstrControl.td (revision 313642)
+++ projects/clang400-import/contrib/llvm/lib/Target/X86/X86InstrControl.td (revision 313643)
@@ -1,358 +1,327 @@
//===-- X86InstrControl.td - Control Flow Instructions -----*- tablegen -*-===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file describes the X86 jump, return, call, and related instructions.
//
//===----------------------------------------------------------------------===//
//===----------------------------------------------------------------------===//
// Control Flow Instructions.
//
// Return instructions.
//
// The X86retflag return instructions are variadic because we may add ST0 and
// ST1 arguments when returning values on the x87 stack.
let isTerminator = 1, isReturn = 1, isBarrier = 1,
hasCtrlDep = 1, FPForm = SpecialFP, SchedRW = [WriteJumpLd] in {
def RETL : I <0xC3, RawFrm, (outs), (ins variable_ops),
"ret{l}", [], IIC_RET>, OpSize32,
Requires<[Not64BitMode]>;
def RETQ : I <0xC3, RawFrm, (outs), (ins variable_ops),
"ret{q}", [], IIC_RET>, OpSize32,
Requires<[In64BitMode]>;
def RETW : I <0xC3, RawFrm, (outs), (ins),
"ret{w}",
[], IIC_RET>, OpSize16;
def RETIL : Ii16<0xC2, RawFrm, (outs), (ins i16imm:$amt, variable_ops),
"ret{l}\t$amt",
[], IIC_RET_IMM>, OpSize32,
Requires<[Not64BitMode]>;
def RETIQ : Ii16<0xC2, RawFrm, (outs), (ins i16imm:$amt, variable_ops),
"ret{q}\t$amt",
[], IIC_RET_IMM>, OpSize32,
Requires<[In64BitMode]>;
def RETIW : Ii16<0xC2, RawFrm, (outs), (ins i16imm:$amt),
"ret{w}\t$amt",
[], IIC_RET_IMM>, OpSize16;
def LRETL : I <0xCB, RawFrm, (outs), (ins),
"{l}ret{l|f}", [], IIC_RET>, OpSize32;
def LRETQ : RI <0xCB, RawFrm, (outs), (ins),
"{l}ret{|f}q", [], IIC_RET>, Requires<[In64BitMode]>;
def LRETW : I <0xCB, RawFrm, (outs), (ins),
"{l}ret{w|f}", [], IIC_RET>, OpSize16;
def LRETIL : Ii16<0xCA, RawFrm, (outs), (ins i16imm:$amt),
"{l}ret{l|f}\t$amt", [], IIC_RET>, OpSize32;
def LRETIQ : RIi16<0xCA, RawFrm, (outs), (ins i16imm:$amt),
"{l}ret{|f}q\t$amt", [], IIC_RET>, Requires<[In64BitMode]>;
def LRETIW : Ii16<0xCA, RawFrm, (outs), (ins i16imm:$amt),
"{l}ret{w|f}\t$amt", [], IIC_RET>, OpSize16;
// The machine return from interrupt instruction, but sometimes we need to
// perform a post-epilogue stack adjustment. Codegen emits the pseudo form
// which expands to include an SP adjustment if necessary.
def IRET16 : I <0xcf, RawFrm, (outs), (ins), "iret{w}", [], IIC_IRET>,
OpSize16;
def IRET32 : I <0xcf, RawFrm, (outs), (ins), "iret{l|d}", [],
IIC_IRET>, OpSize32;
def IRET64 : RI <0xcf, RawFrm, (outs), (ins), "iretq", [],
IIC_IRET>, Requires<[In64BitMode]>;
let isCodeGenOnly = 1 in
def IRET : PseudoI<(outs), (ins i32imm:$adj), [(X86iret timm:$adj)]>;
def RET : PseudoI<(outs), (ins i32imm:$adj, variable_ops), [(X86retflag timm:$adj)]>;
}
// Unconditional branches.
let isBarrier = 1, isBranch = 1, isTerminator = 1, SchedRW = [WriteJump] in {
def JMP_1 : Ii8PCRel<0xEB, RawFrm, (outs), (ins brtarget8:$dst),
"jmp\t$dst", [(br bb:$dst)], IIC_JMP_REL>;
let hasSideEffects = 0, isCodeGenOnly = 1, ForceDisassemble = 1 in {
def JMP_2 : Ii16PCRel<0xE9, RawFrm, (outs), (ins brtarget16:$dst),
"jmp\t$dst", [], IIC_JMP_REL>, OpSize16;
def JMP_4 : Ii32PCRel<0xE9, RawFrm, (outs), (ins brtarget32:$dst),
"jmp\t$dst", [], IIC_JMP_REL>, OpSize32;
}
}
// Conditional Branches.
let isBranch = 1, isTerminator = 1, Uses = [EFLAGS], SchedRW = [WriteJump] in {
multiclass ICBr<bits<8> opc1, bits<8> opc4, string asm, PatFrag Cond> {
def _1 : Ii8PCRel <opc1, RawFrm, (outs), (ins brtarget8:$dst), asm,
[(X86brcond bb:$dst, Cond, EFLAGS)], IIC_Jcc>;
let hasSideEffects = 0, isCodeGenOnly = 1, ForceDisassemble = 1 in {
def _2 : Ii16PCRel<opc4, RawFrm, (outs), (ins brtarget16:$dst), asm,
[], IIC_Jcc>, OpSize16, TB;
def _4 : Ii32PCRel<opc4, RawFrm, (outs), (ins brtarget32:$dst), asm,
[], IIC_Jcc>, TB, OpSize32;
}
}
}
defm JO : ICBr<0x70, 0x80, "jo\t$dst" , X86_COND_O>;
defm JNO : ICBr<0x71, 0x81, "jno\t$dst", X86_COND_NO>;
defm JB : ICBr<0x72, 0x82, "jb\t$dst" , X86_COND_B>;
defm JAE : ICBr<0x73, 0x83, "jae\t$dst", X86_COND_AE>;
defm JE : ICBr<0x74, 0x84, "je\t$dst" , X86_COND_E>;
defm JNE : ICBr<0x75, 0x85, "jne\t$dst", X86_COND_NE>;
defm JBE : ICBr<0x76, 0x86, "jbe\t$dst", X86_COND_BE>;
defm JA : ICBr<0x77, 0x87, "ja\t$dst" , X86_COND_A>;
defm JS : ICBr<0x78, 0x88, "js\t$dst" , X86_COND_S>;
defm JNS : ICBr<0x79, 0x89, "jns\t$dst", X86_COND_NS>;
defm JP : ICBr<0x7A, 0x8A, "jp\t$dst" , X86_COND_P>;
defm JNP : ICBr<0x7B, 0x8B, "jnp\t$dst", X86_COND_NP>;
defm JL : ICBr<0x7C, 0x8C, "jl\t$dst" , X86_COND_L>;
defm JGE : ICBr<0x7D, 0x8D, "jge\t$dst", X86_COND_GE>;
defm JLE : ICBr<0x7E, 0x8E, "jle\t$dst", X86_COND_LE>;
defm JG : ICBr<0x7F, 0x8F, "jg\t$dst" , X86_COND_G>;
// jcx/jecx/jrcx instructions.
let isBranch = 1, isTerminator = 1, hasSideEffects = 0, SchedRW = [WriteJump] in {
// These are the 32-bit versions of this instruction for the asmparser. In
// 32-bit mode, the address size prefix is jcxz and the unprefixed version is
// jecxz.
let Uses = [CX] in
def JCXZ : Ii8PCRel<0xE3, RawFrm, (outs), (ins brtarget8:$dst),
"jcxz\t$dst", [], IIC_JCXZ>, AdSize16,
Requires<[Not64BitMode]>;
let Uses = [ECX] in
def JECXZ : Ii8PCRel<0xE3, RawFrm, (outs), (ins brtarget8:$dst),
"jecxz\t$dst", [], IIC_JCXZ>, AdSize32;
let Uses = [RCX] in
def JRCXZ : Ii8PCRel<0xE3, RawFrm, (outs), (ins brtarget8:$dst),
"jrcxz\t$dst", [], IIC_JCXZ>, AdSize64,
Requires<[In64BitMode]>;
}
// Indirect branches
let isBranch = 1, isTerminator = 1, isBarrier = 1, isIndirectBranch = 1 in {
def JMP16r : I<0xFF, MRM4r, (outs), (ins GR16:$dst), "jmp{w}\t{*}$dst",
[(brind GR16:$dst)], IIC_JMP_REG>, Requires<[Not64BitMode]>,
OpSize16, Sched<[WriteJump]>;
def JMP16m : I<0xFF, MRM4m, (outs), (ins i16mem:$dst), "jmp{w}\t{*}$dst",
[(brind (loadi16 addr:$dst))], IIC_JMP_MEM>,
Requires<[Not64BitMode]>, OpSize16, Sched<[WriteJumpLd]>;
def JMP32r : I<0xFF, MRM4r, (outs), (ins GR32:$dst), "jmp{l}\t{*}$dst",
[(brind GR32:$dst)], IIC_JMP_REG>, Requires<[Not64BitMode]>,
OpSize32, Sched<[WriteJump]>;
def JMP32m : I<0xFF, MRM4m, (outs), (ins i32mem:$dst), "jmp{l}\t{*}$dst",
[(brind (loadi32 addr:$dst))], IIC_JMP_MEM>,
Requires<[Not64BitMode]>, OpSize32, Sched<[WriteJumpLd]>;
def JMP64r : I<0xFF, MRM4r, (outs), (ins GR64:$dst), "jmp{q}\t{*}$dst",
[(brind GR64:$dst)], IIC_JMP_REG>, Requires<[In64BitMode]>,
Sched<[WriteJump]>;
def JMP64m : I<0xFF, MRM4m, (outs), (ins i64mem:$dst), "jmp{q}\t{*}$dst",
[(brind (loadi64 addr:$dst))], IIC_JMP_MEM>,
Requires<[In64BitMode]>, Sched<[WriteJumpLd]>;
let Predicates = [Not64BitMode] in {
def FARJMP16i : Iseg16<0xEA, RawFrmImm16, (outs),
(ins i16imm:$off, i16imm:$seg),
"ljmp{w}\t$seg, $off", [],
IIC_JMP_FAR_PTR>, OpSize16, Sched<[WriteJump]>;
def FARJMP32i : Iseg32<0xEA, RawFrmImm16, (outs),
(ins i32imm:$off, i16imm:$seg),
"ljmp{l}\t$seg, $off", [],
IIC_JMP_FAR_PTR>, OpSize32, Sched<[WriteJump]>;
}
def FARJMP64 : RI<0xFF, MRM5m, (outs), (ins opaque80mem:$dst),
"ljmp{q}\t{*}$dst", [], IIC_JMP_FAR_MEM>,
Sched<[WriteJump]>;
def FARJMP16m : I<0xFF, MRM5m, (outs), (ins opaque32mem:$dst),
"ljmp{w}\t{*}$dst", [], IIC_JMP_FAR_MEM>, OpSize16,
Sched<[WriteJumpLd]>;
def FARJMP32m : I<0xFF, MRM5m, (outs), (ins opaque48mem:$dst),
"ljmp{l}\t{*}$dst", [], IIC_JMP_FAR_MEM>, OpSize32,
Sched<[WriteJumpLd]>;
}
// Loop instructions
let SchedRW = [WriteJump] in {
def LOOP : Ii8PCRel<0xE2, RawFrm, (outs), (ins brtarget8:$dst), "loop\t$dst", [], IIC_LOOP>;
def LOOPE : Ii8PCRel<0xE1, RawFrm, (outs), (ins brtarget8:$dst), "loope\t$dst", [], IIC_LOOPE>;
def LOOPNE : Ii8PCRel<0xE0, RawFrm, (outs), (ins brtarget8:$dst), "loopne\t$dst", [], IIC_LOOPNE>;
}
//===----------------------------------------------------------------------===//
// Call Instructions...
//
let isCall = 1 in
// All calls clobber the non-callee saved registers. ESP is marked as
// a use to prevent stack-pointer assignments that appear immediately
// before calls from potentially appearing dead. Uses for argument
// registers are added manually.
let Uses = [ESP] in {
def CALLpcrel32 : Ii32PCRel<0xE8, RawFrm,
(outs), (ins i32imm_pcrel:$dst),
"call{l}\t$dst", [], IIC_CALL_RI>, OpSize32,
Requires<[Not64BitMode]>, Sched<[WriteJump]>;
let hasSideEffects = 0 in
def CALLpcrel16 : Ii16PCRel<0xE8, RawFrm,
(outs), (ins i16imm_pcrel:$dst),
"call{w}\t$dst", [], IIC_CALL_RI>, OpSize16,
Sched<[WriteJump]>;
def CALL16r : I<0xFF, MRM2r, (outs), (ins GR16:$dst),
"call{w}\t{*}$dst", [(X86call GR16:$dst)], IIC_CALL_RI>,
OpSize16, Requires<[Not64BitMode]>, Sched<[WriteJump]>;
def CALL16m : I<0xFF, MRM2m, (outs), (ins i16mem:$dst),
"call{w}\t{*}$dst", [(X86call (loadi16 addr:$dst))],
IIC_CALL_MEM>, OpSize16,
Requires<[Not64BitMode,FavorMemIndirectCall]>,
Sched<[WriteJumpLd]>;
def CALL32r : I<0xFF, MRM2r, (outs), (ins GR32:$dst),
"call{l}\t{*}$dst", [(X86call GR32:$dst)], IIC_CALL_RI>,
OpSize32, Requires<[Not64BitMode]>, Sched<[WriteJump]>;
def CALL32m : I<0xFF, MRM2m, (outs), (ins i32mem:$dst),
"call{l}\t{*}$dst", [(X86call (loadi32 addr:$dst))],
IIC_CALL_MEM>, OpSize32,
Requires<[Not64BitMode,FavorMemIndirectCall]>,
Sched<[WriteJumpLd]>;
let Predicates = [Not64BitMode] in {
def FARCALL16i : Iseg16<0x9A, RawFrmImm16, (outs),
(ins i16imm:$off, i16imm:$seg),
"lcall{w}\t$seg, $off", [],
IIC_CALL_FAR_PTR>, OpSize16, Sched<[WriteJump]>;
def FARCALL32i : Iseg32<0x9A, RawFrmImm16, (outs),
(ins i32imm:$off, i16imm:$seg),
"lcall{l}\t$seg, $off", [],
IIC_CALL_FAR_PTR>, OpSize32, Sched<[WriteJump]>;
}
def FARCALL16m : I<0xFF, MRM3m, (outs), (ins opaque32mem:$dst),
"lcall{w}\t{*}$dst", [], IIC_CALL_FAR_MEM>, OpSize16,
Sched<[WriteJumpLd]>;
def FARCALL32m : I<0xFF, MRM3m, (outs), (ins opaque48mem:$dst),
"lcall{l}\t{*}$dst", [], IIC_CALL_FAR_MEM>, OpSize32,
Sched<[WriteJumpLd]>;
}
// Tail call stuff.
let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1,
isCodeGenOnly = 1, SchedRW = [WriteJumpLd] in
let Uses = [ESP] in {
def TCRETURNdi : PseudoI<(outs),
(ins i32imm_pcrel:$dst, i32imm:$offset), []>;
def TCRETURNri : PseudoI<(outs),
(ins ptr_rc_tailcall:$dst, i32imm:$offset), []>;
let mayLoad = 1 in
def TCRETURNmi : PseudoI<(outs),
(ins i32mem_TC:$dst, i32imm:$offset), []>;
// FIXME: The should be pseudo instructions that are lowered when going to
// mcinst.
def TAILJMPd : Ii32PCRel<0xE9, RawFrm, (outs),
(ins i32imm_pcrel:$dst),
"jmp\t$dst",
[], IIC_JMP_REL>;
def TAILJMPr : I<0xFF, MRM4r, (outs), (ins ptr_rc_tailcall:$dst),
"", [], IIC_JMP_REG>; // FIXME: Remove encoding when JIT is dead.
let mayLoad = 1 in
def TAILJMPm : I<0xFF, MRM4m, (outs), (ins i32mem_TC:$dst),
"jmp{l}\t{*}$dst", [], IIC_JMP_MEM>;
}
-// Conditional tail calls are similar to the above, but they are branches
-// rather than barriers, and they use EFLAGS.
-let isCall = 1, isTerminator = 1, isReturn = 1, isBranch = 1,
- isCodeGenOnly = 1, SchedRW = [WriteJumpLd] in
- let Uses = [ESP, EFLAGS] in {
- def TCRETURNdicc : PseudoI<(outs),
- (ins i32imm_pcrel:$dst, i32imm:$offset, i32imm:$cond), []>;
- // This gets substituted to a conditional jump instruction in MC lowering.
- def TAILJMPd_CC : Ii32PCRel<0x80, RawFrm, (outs),
- (ins i32imm_pcrel:$dst, i32imm:$cond),
- "",
- [], IIC_JMP_REL>;
-}
-
-
//===----------------------------------------------------------------------===//
// Call Instructions...
//
// RSP is marked as a use to prevent stack-pointer assignments that appear
// immediately before calls from potentially appearing dead. Uses for argument
// registers are added manually.
let isCall = 1, Uses = [RSP], SchedRW = [WriteJump] in {
// NOTE: this pattern doesn't match "X86call imm", because we do not know
// that the offset between an arbitrary immediate and the call will fit in
// the 32-bit pcrel field that we have.
def CALL64pcrel32 : Ii32PCRel<0xE8, RawFrm,
(outs), (ins i64i32imm_pcrel:$dst),
"call{q}\t$dst", [], IIC_CALL_RI>, OpSize32,
Requires<[In64BitMode]>;
def CALL64r : I<0xFF, MRM2r, (outs), (ins GR64:$dst),
"call{q}\t{*}$dst", [(X86call GR64:$dst)],
IIC_CALL_RI>,
Requires<[In64BitMode]>;
def CALL64m : I<0xFF, MRM2m, (outs), (ins i64mem:$dst),
"call{q}\t{*}$dst", [(X86call (loadi64 addr:$dst))],
IIC_CALL_MEM>,
Requires<[In64BitMode,FavorMemIndirectCall]>;
def FARCALL64 : RI<0xFF, MRM3m, (outs), (ins opaque80mem:$dst),
"lcall{q}\t{*}$dst", [], IIC_CALL_FAR_MEM>;
}
let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1,
isCodeGenOnly = 1, Uses = [RSP], usesCustomInserter = 1,
SchedRW = [WriteJump] in {
def TCRETURNdi64 : PseudoI<(outs),
(ins i64i32imm_pcrel:$dst, i32imm:$offset),
[]>;
def TCRETURNri64 : PseudoI<(outs),
(ins ptr_rc_tailcall:$dst, i32imm:$offset), []>;
let mayLoad = 1 in
def TCRETURNmi64 : PseudoI<(outs),
(ins i64mem_TC:$dst, i32imm:$offset), []>;
def TAILJMPd64 : Ii32PCRel<0xE9, RawFrm, (outs), (ins i64i32imm_pcrel:$dst),
"jmp\t$dst", [], IIC_JMP_REL>;
def TAILJMPr64 : I<0xFF, MRM4r, (outs), (ins ptr_rc_tailcall:$dst),
"jmp{q}\t{*}$dst", [], IIC_JMP_MEM>;
let mayLoad = 1 in
def TAILJMPm64 : I<0xFF, MRM4m, (outs), (ins i64mem_TC:$dst),
"jmp{q}\t{*}$dst", [], IIC_JMP_MEM>;
// Win64 wants indirect jumps leaving the function to have a REX_W prefix.
let hasREX_WPrefix = 1 in {
def TAILJMPr64_REX : I<0xFF, MRM4r, (outs), (ins ptr_rc_tailcall:$dst),
"rex64 jmp{q}\t{*}$dst", [], IIC_JMP_MEM>;
let mayLoad = 1 in
def TAILJMPm64_REX : I<0xFF, MRM4m, (outs), (ins i64mem_TC:$dst),
"rex64 jmp{q}\t{*}$dst", [], IIC_JMP_MEM>;
}
-}
-
-// Conditional tail calls are similar to the above, but they are branches
-// rather than barriers, and they use EFLAGS.
-let isCall = 1, isTerminator = 1, isReturn = 1, isBranch = 1,
- isCodeGenOnly = 1, SchedRW = [WriteJumpLd] in
- let Uses = [RSP, EFLAGS] in {
- def TCRETURNdi64cc : PseudoI<(outs),
- (ins i64i32imm_pcrel:$dst, i32imm:$offset,
- i32imm:$cond), []>;
-
- // This gets substituted to a conditional jump instruction in MC lowering.
- def TAILJMPd64_CC : Ii32PCRel<0x80, RawFrm, (outs),
- (ins i64i32imm_pcrel:$dst, i32imm:$cond),
- "",
- [], IIC_JMP_REL>;
}
Index: projects/clang400-import/contrib/llvm/lib/Target/X86/X86InstrInfo.cpp
===================================================================
--- projects/clang400-import/contrib/llvm/lib/Target/X86/X86InstrInfo.cpp (revision 313642)
+++ projects/clang400-import/contrib/llvm/lib/Target/X86/X86InstrInfo.cpp (revision 313643)
@@ -1,9746 +1,9667 @@
//===-- X86InstrInfo.cpp - X86 Instruction Information --------------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file contains the X86 implementation of the TargetInstrInfo class.
//
//===----------------------------------------------------------------------===//
#include "X86InstrInfo.h"
#include "X86.h"
#include "X86InstrBuilder.h"
#include "X86MachineFunctionInfo.h"
#include "X86Subtarget.h"
#include "X86TargetMachine.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/CodeGen/LivePhysRegs.h"
#include "llvm/CodeGen/LiveVariables.h"
#include "llvm/CodeGen/MachineConstantPool.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/StackMaps.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCInst.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetOptions.h"
using namespace llvm;
#define DEBUG_TYPE "x86-instr-info"
#define GET_INSTRINFO_CTOR_DTOR
#include "X86GenInstrInfo.inc"
static cl::opt<bool>
NoFusing("disable-spill-fusing",
cl::desc("Disable fusing of spill code into instructions"));
static cl::opt<bool>
PrintFailedFusing("print-failed-fuse-candidates",
cl::desc("Print instructions that the allocator wants to"
" fuse, but the X86 backend currently can't"),
cl::Hidden);
static cl::opt<bool>
ReMatPICStubLoad("remat-pic-stub-load",
cl::desc("Re-materialize load from stub in PIC mode"),
cl::init(false), cl::Hidden);
static cl::opt<unsigned>
PartialRegUpdateClearance("partial-reg-update-clearance",
cl::desc("Clearance between two register writes "
"for inserting XOR to avoid partial "
"register update"),
cl::init(64), cl::Hidden);
static cl::opt<unsigned>
UndefRegClearance("undef-reg-clearance",
cl::desc("How many idle instructions we would like before "
"certain undef register reads"),
cl::init(128), cl::Hidden);
enum {
// Select which memory operand is being unfolded.
// (stored in bits 0 - 3)
TB_INDEX_0 = 0,
TB_INDEX_1 = 1,
TB_INDEX_2 = 2,
TB_INDEX_3 = 3,
TB_INDEX_4 = 4,
TB_INDEX_MASK = 0xf,
// Do not insert the reverse map (MemOp -> RegOp) into the table.
// This may be needed because there is a many -> one mapping.
TB_NO_REVERSE = 1 << 4,
// Do not insert the forward map (RegOp -> MemOp) into the table.
// This is needed for Native Client, which prohibits branch
// instructions from using a memory operand.
TB_NO_FORWARD = 1 << 5,
TB_FOLDED_LOAD = 1 << 6,
TB_FOLDED_STORE = 1 << 7,
// Minimum alignment required for load/store.
// Used for RegOp->MemOp conversion.
// (stored in bits 8 - 15)
TB_ALIGN_SHIFT = 8,
TB_ALIGN_NONE = 0 << TB_ALIGN_SHIFT,
TB_ALIGN_16 = 16 << TB_ALIGN_SHIFT,
TB_ALIGN_32 = 32 << TB_ALIGN_SHIFT,
TB_ALIGN_64 = 64 << TB_ALIGN_SHIFT,
TB_ALIGN_MASK = 0xff << TB_ALIGN_SHIFT
};
struct X86MemoryFoldTableEntry {
uint16_t RegOp;
uint16_t MemOp;
uint16_t Flags;
};
// Pin the vtable to this file.
void X86InstrInfo::anchor() {}
X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
: X86GenInstrInfo((STI.isTarget64BitLP64() ? X86::ADJCALLSTACKDOWN64
: X86::ADJCALLSTACKDOWN32),
(STI.isTarget64BitLP64() ? X86::ADJCALLSTACKUP64
: X86::ADJCALLSTACKUP32),
X86::CATCHRET,
(STI.is64Bit() ? X86::RETQ : X86::RETL)),
Subtarget(STI), RI(STI.getTargetTriple()) {
static const X86MemoryFoldTableEntry MemoryFoldTable2Addr[] = {
{ X86::ADC32ri, X86::ADC32mi, 0 },
{ X86::ADC32ri8, X86::ADC32mi8, 0 },
{ X86::ADC32rr, X86::ADC32mr, 0 },
{ X86::ADC64ri32, X86::ADC64mi32, 0 },
{ X86::ADC64ri8, X86::ADC64mi8, 0 },
{ X86::ADC64rr, X86::ADC64mr, 0 },
{ X86::ADD16ri, X86::ADD16mi, 0 },
{ X86::ADD16ri8, X86::ADD16mi8, 0 },
{ X86::ADD16ri_DB, X86::ADD16mi, TB_NO_REVERSE },
{ X86::ADD16ri8_DB, X86::ADD16mi8, TB_NO_REVERSE },
{ X86::ADD16rr, X86::ADD16mr, 0 },
{ X86::ADD16rr_DB, X86::ADD16mr, TB_NO_REVERSE },
{ X86::ADD32ri, X86::ADD32mi, 0 },
{ X86::ADD32ri8, X86::ADD32mi8, 0 },
{ X86::ADD32ri_DB, X86::ADD32mi, TB_NO_REVERSE },
{ X86::ADD32ri8_DB, X86::ADD32mi8, TB_NO_REVERSE },
{ X86::ADD32rr, X86::ADD32mr, 0 },
{ X86::ADD32rr_DB, X86::ADD32mr, TB_NO_REVERSE },
{ X86::ADD64ri32, X86::ADD64mi32, 0 },
{ X86::ADD64ri8, X86::ADD64mi8, 0 },
{ X86::ADD64ri32_DB,X86::ADD64mi32, TB_NO_REVERSE },
{ X86::ADD64ri8_DB, X86::ADD64mi8, TB_NO_REVERSE },
{ X86::ADD64rr, X86::ADD64mr, 0 },
{ X86::ADD64rr_DB, X86::ADD64mr, TB_NO_REVERSE },
{ X86::ADD8ri, X86::ADD8mi, 0 },
{ X86::ADD8rr, X86::ADD8mr, 0 },
{ X86::AND16ri, X86::AND16mi, 0 },
{ X86::AND16ri8, X86::AND16mi8, 0 },
{ X86::AND16rr, X86::AND16mr, 0 },
{ X86::AND32ri, X86::AND32mi, 0 },
{ X86::AND32ri8, X86::AND32mi8, 0 },
{ X86::AND32rr, X86::AND32mr, 0 },
{ X86::AND64ri32, X86::AND64mi32, 0 },
{ X86::AND64ri8, X86::AND64mi8, 0 },
{ X86::AND64rr, X86::AND64mr, 0 },
{ X86::AND8ri, X86::AND8mi, 0 },
{ X86::AND8rr, X86::AND8mr, 0 },
{ X86::DEC16r, X86::DEC16m, 0 },
{ X86::DEC32r, X86::DEC32m, 0 },
{ X86::DEC64r, X86::DEC64m, 0 },
{ X86::DEC8r, X86::DEC8m, 0 },
{ X86::INC16r, X86::INC16m, 0 },
{ X86::INC32r, X86::INC32m, 0 },
{ X86::INC64r, X86::INC64m, 0 },
{ X86::INC8r, X86::INC8m, 0 },
{ X86::NEG16r, X86::NEG16m, 0 },
{ X86::NEG32r, X86::NEG32m, 0 },
{ X86::NEG64r, X86::NEG64m, 0 },
{ X86::NEG8r, X86::NEG8m, 0 },
{ X86::NOT16r, X86::NOT16m, 0 },
{ X86::NOT32r, X86::NOT32m, 0 },
{ X86::NOT64r, X86::NOT64m, 0 },
{ X86::NOT8r, X86::NOT8m, 0 },
{ X86::OR16ri, X86::OR16mi, 0 },
{ X86::OR16ri8, X86::OR16mi8, 0 },
{ X86::OR16rr, X86::OR16mr, 0 },
{ X86::OR32ri, X86::OR32mi, 0 },
{ X86::OR32ri8, X86::OR32mi8, 0 },
{ X86::OR32rr, X86::OR32mr, 0 },
{ X86::OR64ri32, X86::OR64mi32, 0 },
{ X86::OR64ri8, X86::OR64mi8, 0 },
{ X86::OR64rr, X86::OR64mr, 0 },
{ X86::OR8ri, X86::OR8mi, 0 },
{ X86::OR8rr, X86::OR8mr, 0 },
{ X86::ROL16r1, X86::ROL16m1, 0 },
{ X86::ROL16rCL, X86::ROL16mCL, 0 },
{ X86::ROL16ri, X86::ROL16mi, 0 },
{ X86::ROL32r1, X86::ROL32m1, 0 },
{ X86::ROL32rCL, X86::ROL32mCL, 0 },
{ X86::ROL32ri, X86::ROL32mi, 0 },
{ X86::ROL64r1, X86::ROL64m1, 0 },
{ X86::ROL64rCL, X86::ROL64mCL, 0 },
{ X86::ROL64ri, X86::ROL64mi, 0 },
{ X86::ROL8r1, X86::ROL8m1, 0 },
{ X86::ROL8rCL, X86::ROL8mCL, 0 },
{ X86::ROL8ri, X86::ROL8mi, 0 },
{ X86::ROR16r1, X86::ROR16m1, 0 },
{ X86::ROR16rCL, X86::ROR16mCL, 0 },
{ X86::ROR16ri, X86::ROR16mi, 0 },
{ X86::ROR32r1, X86::ROR32m1, 0 },
{ X86::ROR32rCL, X86::ROR32mCL, 0 },
{ X86::ROR32ri, X86::ROR32mi, 0 },
{ X86::ROR64r1, X86::ROR64m1, 0 },
{ X86::ROR64rCL, X86::ROR64mCL, 0 },
{ X86::ROR64ri, X86::ROR64mi, 0 },
{ X86::ROR8r1, X86::ROR8m1, 0 },
{ X86::ROR8rCL, X86::ROR8mCL, 0 },
{ X86::ROR8ri, X86::ROR8mi, 0 },
{ X86::SAR16r1, X86::SAR16m1, 0 },
{ X86::SAR16rCL, X86::SAR16mCL, 0 },
{ X86::SAR16ri, X86::SAR16mi, 0 },
{ X86::SAR32r1, X86::SAR32m1, 0 },
{ X86::SAR32rCL, X86::SAR32mCL, 0 },
{ X86::SAR32ri, X86::SAR32mi, 0 },
{ X86::SAR64r1, X86::SAR64m1, 0 },
{ X86::SAR64rCL, X86::SAR64mCL, 0 },
{ X86::SAR64ri, X86::SAR64mi, 0 },
{ X86::SAR8r1, X86::SAR8m1, 0 },
{ X86::SAR8rCL, X86::SAR8mCL, 0 },
{ X86::SAR8ri, X86::SAR8mi, 0 },
{ X86::SBB32ri, X86::SBB32mi, 0 },
{ X86::SBB32ri8, X86::SBB32mi8, 0 },
{ X86::SBB32rr, X86::SBB32mr, 0 },
{ X86::SBB64ri32, X86::SBB64mi32, 0 },
{ X86::SBB64ri8, X86::SBB64mi8, 0 },
{ X86::SBB64rr, X86::SBB64mr, 0 },
{ X86::SHL16r1, X86::SHL16m1, 0 },
{ X86::SHL16rCL, X86::SHL16mCL, 0 },
{ X86::SHL16ri, X86::SHL16mi, 0 },
{ X86::SHL32r1, X86::SHL32m1, 0 },
{ X86::SHL32rCL, X86::SHL32mCL, 0 },
{ X86::SHL32ri, X86::SHL32mi, 0 },
{ X86::SHL64r1, X86::SHL64m1, 0 },
{ X86::SHL64rCL, X86::SHL64mCL, 0 },
{ X86::SHL64ri, X86::SHL64mi, 0 },
{ X86::SHL8r1, X86::SHL8m1, 0 },
{ X86::SHL8rCL, X86::SHL8mCL, 0 },
{ X86::SHL8ri, X86::SHL8mi, 0 },
{ X86::SHLD16rrCL, X86::SHLD16mrCL, 0 },
{ X86::SHLD16rri8, X86::SHLD16mri8, 0 },
{ X86::SHLD32rrCL, X86::SHLD32mrCL, 0 },
{ X86::SHLD32rri8, X86::SHLD32mri8, 0 },
{ X86::SHLD64rrCL, X86::SHLD64mrCL, 0 },
{ X86::SHLD64rri8, X86::SHLD64mri8, 0 },
{ X86::SHR16r1, X86::SHR16m1, 0 },
{ X86::SHR16rCL, X86::SHR16mCL, 0 },
{ X86::SHR16ri, X86::SHR16mi, 0 },
{ X86::SHR32r1, X86::SHR32m1, 0 },
{ X86::SHR32rCL, X86::SHR32mCL, 0 },
{ X86::SHR32ri, X86::SHR32mi, 0 },
{ X86::SHR64r1, X86::SHR64m1, 0 },
{ X86::SHR64rCL, X86::SHR64mCL, 0 },
{ X86::SHR64ri, X86::SHR64mi, 0 },
{ X86::SHR8r1, X86::SHR8m1, 0 },
{ X86::SHR8rCL, X86::SHR8mCL, 0 },
{ X86::SHR8ri, X86::SHR8mi, 0 },
{ X86::SHRD16rrCL, X86::SHRD16mrCL, 0 },
{ X86::SHRD16rri8, X86::SHRD16mri8, 0 },
{ X86::SHRD32rrCL, X86::SHRD32mrCL, 0 },
{ X86::SHRD32rri8, X86::SHRD32mri8, 0 },
{ X86::SHRD64rrCL, X86::SHRD64mrCL, 0 },
{ X86::SHRD64rri8, X86::SHRD64mri8, 0 },
{ X86::SUB16ri, X86::SUB16mi, 0 },
{ X86::SUB16ri8, X86::SUB16mi8, 0 },
{ X86::SUB16rr, X86::SUB16mr, 0 },
{ X86::SUB32ri, X86::SUB32mi, 0 },
{ X86::SUB32ri8, X86::SUB32mi8, 0 },
{ X86::SUB32rr, X86::SUB32mr, 0 },
{ X86::SUB64ri32, X86::SUB64mi32, 0 },
{ X86::SUB64ri8, X86::SUB64mi8, 0 },
{ X86::SUB64rr, X86::SUB64mr, 0 },
{ X86::SUB8ri, X86::SUB8mi, 0 },
{ X86::SUB8rr, X86::SUB8mr, 0 },
{ X86::XOR16ri, X86::XOR16mi, 0 },
{ X86::XOR16ri8, X86::XOR16mi8, 0 },
{ X86::XOR16rr, X86::XOR16mr, 0 },
{ X86::XOR32ri, X86::XOR32mi, 0 },
{ X86::XOR32ri8, X86::XOR32mi8, 0 },
{ X86::XOR32rr, X86::XOR32mr, 0 },
{ X86::XOR64ri32, X86::XOR64mi32, 0 },
{ X86::XOR64ri8, X86::XOR64mi8, 0 },
{ X86::XOR64rr, X86::XOR64mr, 0 },
{ X86::XOR8ri, X86::XOR8mi, 0 },
{ X86::XOR8rr, X86::XOR8mr, 0 }
};
for (X86MemoryFoldTableEntry Entry : MemoryFoldTable2Addr) {
AddTableEntry(RegOp2MemOpTable2Addr, MemOp2RegOpTable,
Entry.RegOp, Entry.MemOp,
// Index 0, folded load and store, no alignment requirement.
Entry.Flags | TB_INDEX_0 | TB_FOLDED_LOAD | TB_FOLDED_STORE);
}
static const X86MemoryFoldTableEntry MemoryFoldTable0[] = {
{ X86::BT16ri8, X86::BT16mi8, TB_FOLDED_LOAD },
{ X86::BT32ri8, X86::BT32mi8, TB_FOLDED_LOAD },
{ X86::BT64ri8, X86::BT64mi8, TB_FOLDED_LOAD },
{ X86::CALL32r, X86::CALL32m, TB_FOLDED_LOAD },
{ X86::CALL64r, X86::CALL64m, TB_FOLDED_LOAD },
{ X86::CMP16ri, X86::CMP16mi, TB_FOLDED_LOAD },
{ X86::CMP16ri8, X86::CMP16mi8, TB_FOLDED_LOAD },
{ X86::CMP16rr, X86::CMP16mr, TB_FOLDED_LOAD },
{ X86::CMP32ri, X86::CMP32mi, TB_FOLDED_LOAD },
{ X86::CMP32ri8, X86::CMP32mi8, TB_FOLDED_LOAD },
{ X86::CMP32rr, X86::CMP32mr, TB_FOLDED_LOAD },
{ X86::CMP64ri32, X86::CMP64mi32, TB_FOLDED_LOAD },
{ X86::CMP64ri8, X86::CMP64mi8, TB_FOLDED_LOAD },
{ X86::CMP64rr, X86::CMP64mr, TB_FOLDED_LOAD },
{ X86::CMP8ri, X86::CMP8mi, TB_FOLDED_LOAD },
{ X86::CMP8rr, X86::CMP8mr, TB_FOLDED_LOAD },
{ X86::DIV16r, X86::DIV16m, TB_FOLDED_LOAD },
{ X86::DIV32r, X86::DIV32m, TB_FOLDED_LOAD },
{ X86::DIV64r, X86::DIV64m, TB_FOLDED_LOAD },
{ X86::DIV8r, X86::DIV8m, TB_FOLDED_LOAD },
{ X86::EXTRACTPSrr, X86::EXTRACTPSmr, TB_FOLDED_STORE },
{ X86::IDIV16r, X86::IDIV16m, TB_FOLDED_LOAD },
{ X86::IDIV32r, X86::IDIV32m, TB_FOLDED_LOAD },
{ X86::IDIV64r, X86::IDIV64m, TB_FOLDED_LOAD },
{ X86::IDIV8r, X86::IDIV8m, TB_FOLDED_LOAD },
{ X86::IMUL16r, X86::IMUL16m, TB_FOLDED_LOAD },
{ X86::IMUL32r, X86::IMUL32m, TB_FOLDED_LOAD },
{ X86::IMUL64r, X86::IMUL64m, TB_FOLDED_LOAD },
{ X86::IMUL8r, X86::IMUL8m, TB_FOLDED_LOAD },
{ X86::JMP32r, X86::JMP32m, TB_FOLDED_LOAD },
{ X86::JMP64r, X86::JMP64m, TB_FOLDED_LOAD },
{ X86::MOV16ri, X86::MOV16mi, TB_FOLDED_STORE },
{ X86::MOV16rr, X86::MOV16mr, TB_FOLDED_STORE },
{ X86::MOV32ri, X86::MOV32mi, TB_FOLDED_STORE },
{ X86::MOV32rr, X86::MOV32mr, TB_FOLDED_STORE },
{ X86::MOV64ri32, X86::MOV64mi32, TB_FOLDED_STORE },
{ X86::MOV64rr, X86::MOV64mr, TB_FOLDED_STORE },
{ X86::MOV8ri, X86::MOV8mi, TB_FOLDED_STORE },
{ X86::MOV8rr, X86::MOV8mr, TB_FOLDED_STORE },
{ X86::MOV8rr_NOREX, X86::MOV8mr_NOREX, TB_FOLDED_STORE },
{ X86::MOVAPDrr, X86::MOVAPDmr, TB_FOLDED_STORE | TB_ALIGN_16 },
{ X86::MOVAPSrr, X86::MOVAPSmr, TB_FOLDED_STORE | TB_ALIGN_16 },
{ X86::MOVDQArr, X86::MOVDQAmr, TB_FOLDED_STORE | TB_ALIGN_16 },
{ X86::MOVDQUrr, X86::MOVDQUmr, TB_FOLDED_STORE },
{ X86::MOVPDI2DIrr, X86::MOVPDI2DImr, TB_FOLDED_STORE },
{ X86::MOVPQIto64rr,X86::MOVPQI2QImr, TB_FOLDED_STORE },
{ X86::MOVSDto64rr, X86::MOVSDto64mr, TB_FOLDED_STORE },
{ X86::MOVSS2DIrr, X86::MOVSS2DImr, TB_FOLDED_STORE },
{ X86::MOVUPDrr, X86::MOVUPDmr, TB_FOLDED_STORE },
{ X86::MOVUPSrr, X86::MOVUPSmr, TB_FOLDED_STORE },
{ X86::MUL16r, X86::MUL16m, TB_FOLDED_LOAD },
{ X86::MUL32r, X86::MUL32m, TB_FOLDED_LOAD },
{ X86::MUL64r, X86::MUL64m, TB_FOLDED_LOAD },
{ X86::MUL8r, X86::MUL8m, TB_FOLDED_LOAD },
{ X86::PEXTRDrr, X86::PEXTRDmr, TB_FOLDED_STORE },
{ X86::PEXTRQrr, X86::PEXTRQmr, TB_FOLDED_STORE },
{ X86::PUSH16r, X86::PUSH16rmm, TB_FOLDED_LOAD },
{ X86::PUSH32r, X86::PUSH32rmm, TB_FOLDED_LOAD },
{ X86::PUSH64r, X86::PUSH64rmm, TB_FOLDED_LOAD },
{ X86::SETAEr, X86::SETAEm, TB_FOLDED_STORE },
{ X86::SETAr, X86::SETAm, TB_FOLDED_STORE },
{ X86::SETBEr, X86::SETBEm, TB_FOLDED_STORE },
{ X86::SETBr, X86::SETBm, TB_FOLDED_STORE },
{ X86::SETEr, X86::SETEm, TB_FOLDED_STORE },
{ X86::SETGEr, X86::SETGEm, TB_FOLDED_STORE },
{ X86::SETGr, X86::SETGm, TB_FOLDED_STORE },
{ X86::SETLEr, X86::SETLEm, TB_FOLDED_STORE },
{ X86::SETLr, X86::SETLm, TB_FOLDED_STORE },
{ X86::SETNEr, X86::SETNEm, TB_FOLDED_STORE },
{ X86::SETNOr, X86::SETNOm, TB_FOLDED_STORE },
{ X86::SETNPr, X86::SETNPm, TB_FOLDED_STORE },
{ X86::SETNSr, X86::SETNSm, TB_FOLDED_STORE },
{ X86::SETOr, X86::SETOm, TB_FOLDED_STORE },
{ X86::SETPr, X86::SETPm, TB_FOLDED_STORE },
{ X86::SETSr, X86::SETSm, TB_FOLDED_STORE },
{ X86::TAILJMPr, X86::TAILJMPm, TB_FOLDED_LOAD },
{ X86::TAILJMPr64, X86::TAILJMPm64, TB_FOLDED_LOAD },
{ X86::TAILJMPr64_REX, X86::TAILJMPm64_REX, TB_FOLDED_LOAD },
{ X86::TEST16ri, X86::TEST16mi, TB_FOLDED_LOAD },
{ X86::TEST32ri, X86::TEST32mi, TB_FOLDED_LOAD },
{ X86::TEST64ri32, X86::TEST64mi32, TB_FOLDED_LOAD },
{ X86::TEST8ri, X86::TEST8mi, TB_FOLDED_LOAD },
// AVX 128-bit versions of foldable instructions
{ X86::VEXTRACTPSrr,X86::VEXTRACTPSmr, TB_FOLDED_STORE },
{ X86::VEXTRACTF128rr, X86::VEXTRACTF128mr, TB_FOLDED_STORE | TB_ALIGN_16 },
{ X86::VMOVAPDrr, X86::VMOVAPDmr, TB_FOLDED_STORE | TB_ALIGN_16 },
{ X86::VMOVAPSrr, X86::VMOVAPSmr, TB_FOLDED_STORE | TB_ALIGN_16 },
{ X86::VMOVDQArr, X86::VMOVDQAmr, TB_FOLDED_STORE | TB_ALIGN_16 },
{ X86::VMOVDQUrr, X86::VMOVDQUmr, TB_FOLDED_STORE },
{ X86::VMOVPDI2DIrr,X86::VMOVPDI2DImr, TB_FOLDED_STORE },
{ X86::VMOVPQIto64rr, X86::VMOVPQI2QImr,TB_FOLDED_STORE },
{ X86::VMOVSDto64rr,X86::VMOVSDto64mr, TB_FOLDED_STORE },
{ X86::VMOVSS2DIrr, X86::VMOVSS2DImr, TB_FOLDED_STORE },
{ X86::VMOVUPDrr, X86::VMOVUPDmr, TB_FOLDED_STORE },
{ X86::VMOVUPSrr, X86::VMOVUPSmr, TB_FOLDED_STORE },
{ X86::VPEXTRDrr, X86::VPEXTRDmr, TB_FOLDED_STORE },
{ X86::VPEXTRQrr, X86::VPEXTRQmr, TB_FOLDED_STORE },
// AVX 256-bit foldable instructions
{ X86::VEXTRACTI128rr, X86::VEXTRACTI128mr, TB_FOLDED_STORE | TB_ALIGN_16 },
{ X86::VMOVAPDYrr, X86::VMOVAPDYmr, TB_FOLDED_STORE | TB_ALIGN_32 },
{ X86::VMOVAPSYrr, X86::VMOVAPSYmr, TB_FOLDED_STORE | TB_ALIGN_32 },
{ X86::VMOVDQAYrr, X86::VMOVDQAYmr, TB_FOLDED_STORE | TB_ALIGN_32 },
{ X86::VMOVDQUYrr, X86::VMOVDQUYmr, TB_FOLDED_STORE },
{ X86::VMOVUPDYrr, X86::VMOVUPDYmr, TB_FOLDED_STORE },
{ X86::VMOVUPSYrr, X86::VMOVUPSYmr, TB_FOLDED_STORE },
// AVX-512 foldable instructions
{ X86::VEXTRACTF32x4Zrr,X86::VEXTRACTF32x4Zmr, TB_FOLDED_STORE },
{ X86::VEXTRACTF32x8Zrr,X86::VEXTRACTF32x8Zmr, TB_FOLDED_STORE },
{ X86::VEXTRACTF64x2Zrr,X86::VEXTRACTF64x2Zmr, TB_FOLDED_STORE },
{ X86::VEXTRACTF64x4Zrr,X86::VEXTRACTF64x4Zmr, TB_FOLDED_STORE },
{ X86::VEXTRACTI32x4Zrr,X86::VEXTRACTI32x4Zmr, TB_FOLDED_STORE },
{ X86::VEXTRACTI32x8Zrr,X86::VEXTRACTI32x8Zmr, TB_FOLDED_STORE },
{ X86::VEXTRACTI64x2Zrr,X86::VEXTRACTI64x2Zmr, TB_FOLDED_STORE },
{ X86::VEXTRACTI64x4Zrr,X86::VEXTRACTI64x4Zmr, TB_FOLDED_STORE },
{ X86::VEXTRACTPSZrr, X86::VEXTRACTPSZmr, TB_FOLDED_STORE },
{ X86::VMOVPDI2DIZrr, X86::VMOVPDI2DIZmr, TB_FOLDED_STORE },
{ X86::VMOVAPDZrr, X86::VMOVAPDZmr, TB_FOLDED_STORE | TB_ALIGN_64 },
{ X86::VMOVAPSZrr, X86::VMOVAPSZmr, TB_FOLDED_STORE | TB_ALIGN_64 },
{ X86::VMOVDQA32Zrr, X86::VMOVDQA32Zmr, TB_FOLDED_STORE | TB_ALIGN_64 },
{ X86::VMOVDQA64Zrr, X86::VMOVDQA64Zmr, TB_FOLDED_STORE | TB_ALIGN_64 },
{ X86::VMOVUPDZrr, X86::VMOVUPDZmr, TB_FOLDED_STORE },
{ X86::VMOVUPSZrr, X86::VMOVUPSZmr, TB_FOLDED_STORE },
{ X86::VMOVDQU8Zrr, X86::VMOVDQU8Zmr, TB_FOLDED_STORE },
{ X86::VMOVDQU16Zrr, X86::VMOVDQU16Zmr, TB_FOLDED_STORE },
{ X86::VMOVDQU32Zrr, X86::VMOVDQU32Zmr, TB_FOLDED_STORE },
{ X86::VMOVDQU64Zrr, X86::VMOVDQU64Zmr, TB_FOLDED_STORE },
{ X86::VPMOVDBZrr, X86::VPMOVDBZmr, TB_FOLDED_STORE },
{ X86::VPMOVDWZrr, X86::VPMOVDWZmr, TB_FOLDED_STORE },
{ X86::VPMOVQDZrr, X86::VPMOVQDZmr, TB_FOLDED_STORE },
{ X86::VPMOVQWZrr, X86::VPMOVQWZmr, TB_FOLDED_STORE },
{ X86::VPMOVWBZrr, X86::VPMOVWBZmr, TB_FOLDED_STORE },
{ X86::VPMOVSDBZrr, X86::VPMOVSDBZmr, TB_FOLDED_STORE },
{ X86::VPMOVSDWZrr, X86::VPMOVSDWZmr, TB_FOLDED_STORE },
{ X86::VPMOVSQDZrr, X86::VPMOVSQDZmr, TB_FOLDED_STORE },
{ X86::VPMOVSQWZrr, X86::VPMOVSQWZmr, TB_FOLDED_STORE },
{ X86::VPMOVSWBZrr, X86::VPMOVSWBZmr, TB_FOLDED_STORE },
{ X86::VPMOVUSDBZrr, X86::VPMOVUSDBZmr, TB_FOLDED_STORE },
{ X86::VPMOVUSDWZrr, X86::VPMOVUSDWZmr, TB_FOLDED_STORE },
{ X86::VPMOVUSQDZrr, X86::VPMOVUSQDZmr, TB_FOLDED_STORE },
{ X86::VPMOVUSQWZrr, X86::VPMOVUSQWZmr, TB_FOLDED_STORE },
{ X86::VPMOVUSWBZrr, X86::VPMOVUSWBZmr, TB_FOLDED_STORE },
// AVX-512 foldable instructions (256-bit versions)
{ X86::VEXTRACTF32x4Z256rr,X86::VEXTRACTF32x4Z256mr, TB_FOLDED_STORE },
{ X86::VEXTRACTF64x2Z256rr,X86::VEXTRACTF64x2Z256mr, TB_FOLDED_STORE },
{ X86::VEXTRACTI32x4Z256rr,X86::VEXTRACTI32x4Z256mr, TB_FOLDED_STORE },
{ X86::VEXTRACTI64x2Z256rr,X86::VEXTRACTI64x2Z256mr, TB_FOLDED_STORE },
{ X86::VMOVAPDZ256rr, X86::VMOVAPDZ256mr, TB_FOLDED_STORE | TB_ALIGN_32 },
{ X86::VMOVAPSZ256rr, X86::VMOVAPSZ256mr, TB_FOLDED_STORE | TB_ALIGN_32 },
{ X86::VMOVDQA32Z256rr, X86::VMOVDQA32Z256mr, TB_FOLDED_STORE | TB_ALIGN_32 },
{ X86::VMOVDQA64Z256rr, X86::VMOVDQA64Z256mr, TB_FOLDED_STORE | TB_ALIGN_32 },
{ X86::VMOVUPDZ256rr, X86::VMOVUPDZ256mr, TB_FOLDED_STORE },
{ X86::VMOVUPSZ256rr, X86::VMOVUPSZ256mr, TB_FOLDED_STORE },
{ X86::VMOVDQU8Z256rr, X86::VMOVDQU8Z256mr, TB_FOLDED_STORE },
{ X86::VMOVDQU16Z256rr, X86::VMOVDQU16Z256mr, TB_FOLDED_STORE },
{ X86::VMOVDQU32Z256rr, X86::VMOVDQU32Z256mr, TB_FOLDED_STORE },
{ X86::VMOVDQU64Z256rr, X86::VMOVDQU64Z256mr, TB_FOLDED_STORE },
{ X86::VPMOVDWZ256rr, X86::VPMOVDWZ256mr, TB_FOLDED_STORE },
{ X86::VPMOVQDZ256rr, X86::VPMOVQDZ256mr, TB_FOLDED_STORE },
{ X86::VPMOVWBZ256rr, X86::VPMOVWBZ256mr, TB_FOLDED_STORE },
{ X86::VPMOVSDWZ256rr, X86::VPMOVSDWZ256mr, TB_FOLDED_STORE },
{ X86::VPMOVSQDZ256rr, X86::VPMOVSQDZ256mr, TB_FOLDED_STORE },
{ X86::VPMOVSWBZ256rr, X86::VPMOVSWBZ256mr, TB_FOLDED_STORE },
{ X86::VPMOVUSDWZ256rr, X86::VPMOVUSDWZ256mr, TB_FOLDED_STORE },
{ X86::VPMOVUSQDZ256rr, X86::VPMOVUSQDZ256mr, TB_FOLDED_STORE },
{ X86::VPMOVUSWBZ256rr, X86::VPMOVUSWBZ256mr, TB_FOLDED_STORE },
// AVX-512 foldable instructions (128-bit versions)
{ X86::VMOVAPDZ128rr, X86::VMOVAPDZ128mr, TB_FOLDED_STORE | TB_ALIGN_16 },
{ X86::VMOVAPSZ128rr, X86::VMOVAPSZ128mr, TB_FOLDED_STORE | TB_ALIGN_16 },
{ X86::VMOVDQA32Z128rr, X86::VMOVDQA32Z128mr, TB_FOLDED_STORE | TB_ALIGN_16 },
{ X86::VMOVDQA64Z128rr, X86::VMOVDQA64Z128mr, TB_FOLDED_STORE | TB_ALIGN_16 },
{ X86::VMOVUPDZ128rr, X86::VMOVUPDZ128mr, TB_FOLDED_STORE },
{ X86::VMOVUPSZ128rr, X86::VMOVUPSZ128mr, TB_FOLDED_STORE },
{ X86::VMOVDQU8Z128rr, X86::VMOVDQU8Z128mr, TB_FOLDED_STORE },
{ X86::VMOVDQU16Z128rr, X86::VMOVDQU16Z128mr, TB_FOLDED_STORE },
{ X86::VMOVDQU32Z128rr, X86::VMOVDQU32Z128mr, TB_FOLDED_STORE },
{ X86::VMOVDQU64Z128rr, X86::VMOVDQU64Z128mr, TB_FOLDED_STORE },
// F16C foldable instructions
{ X86::VCVTPS2PHrr, X86::VCVTPS2PHmr, TB_FOLDED_STORE },
{ X86::VCVTPS2PHYrr, X86::VCVTPS2PHYmr, TB_FOLDED_STORE }
};
for (X86MemoryFoldTableEntry Entry : MemoryFoldTable0) {
AddTableEntry(RegOp2MemOpTable0, MemOp2RegOpTable,
Entry.RegOp, Entry.MemOp, TB_INDEX_0 | Entry.Flags);
}
static const X86MemoryFoldTableEntry MemoryFoldTable1[] = {
{ X86::BSF16rr, X86::BSF16rm, 0 },
{ X86::BSF32rr, X86::BSF32rm, 0 },
{ X86::BSF64rr, X86::BSF64rm, 0 },
{ X86::BSR16rr, X86::BSR16rm, 0 },
{ X86::BSR32rr, X86::BSR32rm, 0 },
{ X86::BSR64rr, X86::BSR64rm, 0 },
{ X86::CMP16rr, X86::CMP16rm, 0 },
{ X86::CMP32rr, X86::CMP32rm, 0 },
{ X86::CMP64rr, X86::CMP64rm, 0 },
{ X86::CMP8rr, X86::CMP8rm, 0 },
{ X86::CVTSD2SSrr, X86::CVTSD2SSrm, 0 },
{ X86::CVTSI2SD64rr, X86::CVTSI2SD64rm, 0 },
{ X86::CVTSI2SDrr, X86::CVTSI2SDrm, 0 },
{ X86::CVTSI2SS64rr, X86::CVTSI2SS64rm, 0 },
{ X86::CVTSI2SSrr, X86::CVTSI2SSrm, 0 },
{ X86::CVTSS2SDrr, X86::CVTSS2SDrm, 0 },
{ X86::CVTTSD2SI64rr, X86::CVTTSD2SI64rm, 0 },
{ X86::CVTTSD2SIrr, X86::CVTTSD2SIrm, 0 },
{ X86::CVTTSS2SI64rr, X86::CVTTSS2SI64rm, 0 },
{ X86::CVTTSS2SIrr, X86::CVTTSS2SIrm, 0 },
{ X86::IMUL16rri, X86::IMUL16rmi, 0 },
{ X86::IMUL16rri8, X86::IMUL16rmi8, 0 },
{ X86::IMUL32rri, X86::IMUL32rmi, 0 },
{ X86::IMUL32rri8, X86::IMUL32rmi8, 0 },
{ X86::IMUL64rri32, X86::IMUL64rmi32, 0 },
{ X86::IMUL64rri8, X86::IMUL64rmi8, 0 },
{ X86::Int_COMISDrr, X86::Int_COMISDrm, TB_NO_REVERSE },
{ X86::Int_COMISSrr, X86::Int_COMISSrm, TB_NO_REVERSE },
{ X86::CVTSD2SI64rr, X86::CVTSD2SI64rm, TB_NO_REVERSE },
{ X86::CVTSD2SIrr, X86::CVTSD2SIrm, TB_NO_REVERSE },
{ X86::CVTSS2SI64rr, X86::CVTSS2SI64rm, TB_NO_REVERSE },
{ X86::CVTSS2SIrr, X86::CVTSS2SIrm, TB_NO_REVERSE },
{ X86::CVTDQ2PDrr, X86::CVTDQ2PDrm, TB_NO_REVERSE },
{ X86::CVTDQ2PSrr, X86::CVTDQ2PSrm, TB_ALIGN_16 },
{ X86::CVTPD2DQrr, X86::CVTPD2DQrm, TB_ALIGN_16 },
{ X86::CVTPD2PSrr, X86::CVTPD2PSrm, TB_ALIGN_16 },
{ X86::CVTPS2DQrr, X86::CVTPS2DQrm, TB_ALIGN_16 },
{ X86::CVTPS2PDrr, X86::CVTPS2PDrm, TB_NO_REVERSE },
{ X86::CVTTPD2DQrr, X86::CVTTPD2DQrm, TB_ALIGN_16 },
{ X86::CVTTPS2DQrr, X86::CVTTPS2DQrm, TB_ALIGN_16 },
{ X86::Int_CVTTSD2SI64rr,X86::Int_CVTTSD2SI64rm, TB_NO_REVERSE },
{ X86::Int_CVTTSD2SIrr, X86::Int_CVTTSD2SIrm, TB_NO_REVERSE },
{ X86::Int_CVTTSS2SI64rr,X86::Int_CVTTSS2SI64rm, TB_NO_REVERSE },
{ X86::Int_CVTTSS2SIrr, X86::Int_CVTTSS2SIrm, TB_NO_REVERSE },
{ X86::Int_UCOMISDrr, X86::Int_UCOMISDrm, TB_NO_REVERSE },
{ X86::Int_UCOMISSrr, X86::Int_UCOMISSrm, TB_NO_REVERSE },
{ X86::MOV16rr, X86::MOV16rm, 0 },
{ X86::MOV32rr, X86::MOV32rm, 0 },
{ X86::MOV64rr, X86::MOV64rm, 0 },
{ X86::MOV64toPQIrr, X86::MOVQI2PQIrm, 0 },
{ X86::MOV64toSDrr, X86::MOV64toSDrm, 0 },
{ X86::MOV8rr, X86::MOV8rm, 0 },
{ X86::MOVAPDrr, X86::MOVAPDrm, TB_ALIGN_16 },
{ X86::MOVAPSrr, X86::MOVAPSrm, TB_ALIGN_16 },
{ X86::MOVDDUPrr, X86::MOVDDUPrm, TB_NO_REVERSE },
{ X86::MOVDI2PDIrr, X86::MOVDI2PDIrm, 0 },
{ X86::MOVDI2SSrr, X86::MOVDI2SSrm, 0 },
{ X86::MOVDQArr, X86::MOVDQArm, TB_ALIGN_16 },
{ X86::MOVDQUrr, X86::MOVDQUrm, 0 },
{ X86::MOVSHDUPrr, X86::MOVSHDUPrm, TB_ALIGN_16 },
{ X86::MOVSLDUPrr, X86::MOVSLDUPrm, TB_ALIGN_16 },
{ X86::MOVSX16rr8, X86::MOVSX16rm8, 0 },
{ X86::MOVSX32rr16, X86::MOVSX32rm16, 0 },
{ X86::MOVSX32rr8, X86::MOVSX32rm8, 0 },
{ X86::MOVSX64rr16, X86::MOVSX64rm16, 0 },
{ X86::MOVSX64rr32, X86::MOVSX64rm32, 0 },
{ X86::MOVSX64rr8, X86::MOVSX64rm8, 0 },
{ X86::MOVUPDrr, X86::MOVUPDrm, 0 },
{ X86::MOVUPSrr, X86::MOVUPSrm, 0 },
{ X86::MOVZPQILo2PQIrr, X86::MOVQI2PQIrm, TB_NO_REVERSE },
{ X86::MOVZX16rr8, X86::MOVZX16rm8, 0 },
{ X86::MOVZX32rr16, X86::MOVZX32rm16, 0 },
{ X86::MOVZX32_NOREXrr8, X86::MOVZX32_NOREXrm8, 0 },
{ X86::MOVZX32rr8, X86::MOVZX32rm8, 0 },
{ X86::PABSBrr, X86::PABSBrm, TB_ALIGN_16 },
{ X86::PABSDrr, X86::PABSDrm, TB_ALIGN_16 },
{ X86::PABSWrr, X86::PABSWrm, TB_ALIGN_16 },
{ X86::PCMPESTRIrr, X86::PCMPESTRIrm, TB_ALIGN_16 },
{ X86::PCMPESTRM128rr, X86::PCMPESTRM128rm, TB_ALIGN_16 },
{ X86::PCMPISTRIrr, X86::PCMPISTRIrm, TB_ALIGN_16 },
{ X86::PCMPISTRM128rr, X86::PCMPISTRM128rm, TB_ALIGN_16 },
{ X86::PHMINPOSUWrr128, X86::PHMINPOSUWrm128, TB_ALIGN_16 },
{ X86::PMOVSXBDrr, X86::PMOVSXBDrm, TB_NO_REVERSE },
{ X86::PMOVSXBQrr, X86::PMOVSXBQrm, TB_NO_REVERSE },
{ X86::PMOVSXBWrr, X86::PMOVSXBWrm, TB_NO_REVERSE },
{ X86::PMOVSXDQrr, X86::PMOVSXDQrm, TB_NO_REVERSE },
{ X86::PMOVSXWDrr, X86::PMOVSXWDrm, TB_NO_REVERSE },
{ X86::PMOVSXWQrr, X86::PMOVSXWQrm, TB_NO_REVERSE },
{ X86::PMOVZXBDrr, X86::PMOVZXBDrm, TB_NO_REVERSE },
{ X86::PMOVZXBQrr, X86::PMOVZXBQrm, TB_NO_REVERSE },
{ X86::PMOVZXBWrr, X86::PMOVZXBWrm, TB_NO_REVERSE },
{ X86::PMOVZXDQrr, X86::PMOVZXDQrm, TB_NO_REVERSE },
{ X86::PMOVZXWDrr, X86::PMOVZXWDrm, TB_NO_REVERSE },
{ X86::PMOVZXWQrr, X86::PMOVZXWQrm, TB_NO_REVERSE },
{ X86::PSHUFDri, X86::PSHUFDmi, TB_ALIGN_16 },
{ X86::PSHUFHWri, X86::PSHUFHWmi, TB_ALIGN_16 },
{ X86::PSHUFLWri, X86::PSHUFLWmi, TB_ALIGN_16 },
{ X86::PTESTrr, X86::PTESTrm, TB_ALIGN_16 },
{ X86::RCPPSr, X86::RCPPSm, TB_ALIGN_16 },
{ X86::RCPSSr, X86::RCPSSm, 0 },
{ X86::RCPSSr_Int, X86::RCPSSm_Int, TB_NO_REVERSE },
{ X86::ROUNDPDr, X86::ROUNDPDm, TB_ALIGN_16 },
{ X86::ROUNDPSr, X86::ROUNDPSm, TB_ALIGN_16 },
{ X86::ROUNDSDr, X86::ROUNDSDm, 0 },
{ X86::ROUNDSSr, X86::ROUNDSSm, 0 },
{ X86::RSQRTPSr, X86::RSQRTPSm, TB_ALIGN_16 },
{ X86::RSQRTSSr, X86::RSQRTSSm, 0 },
{ X86::RSQRTSSr_Int, X86::RSQRTSSm_Int, TB_NO_REVERSE },
{ X86::SQRTPDr, X86::SQRTPDm, TB_ALIGN_16 },
{ X86::SQRTPSr, X86::SQRTPSm, TB_ALIGN_16 },
{ X86::SQRTSDr, X86::SQRTSDm, 0 },
{ X86::SQRTSDr_Int, X86::SQRTSDm_Int, TB_NO_REVERSE },
{ X86::SQRTSSr, X86::SQRTSSm, 0 },
{ X86::SQRTSSr_Int, X86::SQRTSSm_Int, TB_NO_REVERSE },
{ X86::TEST16rr, X86::TEST16rm, 0 },
{ X86::TEST32rr, X86::TEST32rm, 0 },
{ X86::TEST64rr, X86::TEST64rm, 0 },
{ X86::TEST8rr, X86::TEST8rm, 0 },
// FIXME: TEST*rr EAX,EAX ---> CMP [mem], 0
{ X86::UCOMISDrr, X86::UCOMISDrm, 0 },
{ X86::UCOMISSrr, X86::UCOMISSrm, 0 },
// MMX version of foldable instructions
{ X86::MMX_CVTPD2PIirr, X86::MMX_CVTPD2PIirm, 0 },
{ X86::MMX_CVTPI2PDirr, X86::MMX_CVTPI2PDirm, 0 },
{ X86::MMX_CVTPS2PIirr, X86::MMX_CVTPS2PIirm, 0 },
{ X86::MMX_CVTTPD2PIirr, X86::MMX_CVTTPD2PIirm, 0 },
{ X86::MMX_CVTTPS2PIirr, X86::MMX_CVTTPS2PIirm, 0 },
{ X86::MMX_MOVD64to64rr, X86::MMX_MOVQ64rm, 0 },
{ X86::MMX_PABSBrr64, X86::MMX_PABSBrm64, 0 },
{ X86::MMX_PABSDrr64, X86::MMX_PABSDrm64, 0 },
{ X86::MMX_PABSWrr64, X86::MMX_PABSWrm64, 0 },
{ X86::MMX_PSHUFWri, X86::MMX_PSHUFWmi, 0 },
// 3DNow! version of foldable instructions
{ X86::PF2IDrr, X86::PF2IDrm, 0 },
{ X86::PF2IWrr, X86::PF2IWrm, 0 },
{ X86::PFRCPrr, X86::PFRCPrm, 0 },
{ X86::PFRSQRTrr, X86::PFRSQRTrm, 0 },
{ X86::PI2FDrr, X86::PI2FDrm, 0 },
{ X86::PI2FWrr, X86::PI2FWrm, 0 },
{ X86::PSWAPDrr, X86::PSWAPDrm, 0 },
// AVX 128-bit versions of foldable instructions
{ X86::Int_VCOMISDrr, X86::Int_VCOMISDrm, TB_NO_REVERSE },
{ X86::Int_VCOMISSrr, X86::Int_VCOMISSrm, TB_NO_REVERSE },
{ X86::Int_VUCOMISDrr, X86::Int_VUCOMISDrm, TB_NO_REVERSE },
{ X86::Int_VUCOMISSrr, X86::Int_VUCOMISSrm, TB_NO_REVERSE },
{ X86::VCVTTSD2SI64rr, X86::VCVTTSD2SI64rm, 0 },
{ X86::Int_VCVTTSD2SI64rr,X86::Int_VCVTTSD2SI64rm,TB_NO_REVERSE },
{ X86::VCVTTSD2SIrr, X86::VCVTTSD2SIrm, 0 },
{ X86::Int_VCVTTSD2SIrr,X86::Int_VCVTTSD2SIrm, TB_NO_REVERSE },
{ X86::VCVTTSS2SI64rr, X86::VCVTTSS2SI64rm, 0 },
{ X86::Int_VCVTTSS2SI64rr,X86::Int_VCVTTSS2SI64rm,TB_NO_REVERSE },
{ X86::VCVTTSS2SIrr, X86::VCVTTSS2SIrm, 0 },
{ X86::Int_VCVTTSS2SIrr,X86::Int_VCVTTSS2SIrm, TB_NO_REVERSE },
{ X86::VCVTSD2SI64rr, X86::VCVTSD2SI64rm, TB_NO_REVERSE },
{ X86::VCVTSD2SIrr, X86::VCVTSD2SIrm, TB_NO_REVERSE },
{ X86::VCVTSS2SI64rr, X86::VCVTSS2SI64rm, TB_NO_REVERSE },
{ X86::VCVTSS2SIrr, X86::VCVTSS2SIrm, TB_NO_REVERSE },
{ X86::VCVTDQ2PDrr, X86::VCVTDQ2PDrm, TB_NO_REVERSE },
{ X86::VCVTDQ2PSrr, X86::VCVTDQ2PSrm, 0 },
{ X86::VCVTPD2DQrr, X86::VCVTPD2DQrm, 0 },
{ X86::VCVTPD2PSrr, X86::VCVTPD2PSrm, 0 },
{ X86::VCVTPS2DQrr, X86::VCVTPS2DQrm, 0 },
{ X86::VCVTPS2PDrr, X86::VCVTPS2PDrm, TB_NO_REVERSE },
{ X86::VCVTTPD2DQrr, X86::VCVTTPD2DQrm, 0 },
{ X86::VCVTTPS2DQrr, X86::VCVTTPS2DQrm, 0 },
{ X86::VMOV64toPQIrr, X86::VMOVQI2PQIrm, 0 },
{ X86::VMOV64toSDrr, X86::VMOV64toSDrm, 0 },
{ X86::VMOVAPDrr, X86::VMOVAPDrm, TB_ALIGN_16 },
{ X86::VMOVAPSrr, X86::VMOVAPSrm, TB_ALIGN_16 },
{ X86::VMOVDDUPrr, X86::VMOVDDUPrm, TB_NO_REVERSE },
{ X86::VMOVDI2PDIrr, X86::VMOVDI2PDIrm, 0 },
{ X86::VMOVDI2SSrr, X86::VMOVDI2SSrm, 0 },
{ X86::VMOVDQArr, X86::VMOVDQArm, TB_ALIGN_16 },
{ X86::VMOVDQUrr, X86::VMOVDQUrm, 0 },
{ X86::VMOVSLDUPrr, X86::VMOVSLDUPrm, 0 },
{ X86::VMOVSHDUPrr, X86::VMOVSHDUPrm, 0 },
{ X86::VMOVUPDrr, X86::VMOVUPDrm, 0 },
{ X86::VMOVUPSrr, X86::VMOVUPSrm, 0 },
{ X86::VMOVZPQILo2PQIrr,X86::VMOVQI2PQIrm, TB_NO_REVERSE },
{ X86::VPABSBrr, X86::VPABSBrm, 0 },
{ X86::VPABSDrr, X86::VPABSDrm, 0 },
{ X86::VPABSWrr, X86::VPABSWrm, 0 },
{ X86::VPCMPESTRIrr, X86::VPCMPESTRIrm, 0 },
{ X86::VPCMPESTRM128rr, X86::VPCMPESTRM128rm, 0 },
{ X86::VPCMPISTRIrr, X86::VPCMPISTRIrm, 0 },
{ X86::VPCMPISTRM128rr, X86::VPCMPISTRM128rm, 0 },
{ X86::VPHMINPOSUWrr128, X86::VPHMINPOSUWrm128, 0 },
{ X86::VPERMILPDri, X86::VPERMILPDmi, 0 },
{ X86::VPERMILPSri, X86::VPERMILPSmi, 0 },
{ X86::VPMOVSXBDrr, X86::VPMOVSXBDrm, TB_NO_REVERSE },
{ X86::VPMOVSXBQrr, X86::VPMOVSXBQrm, TB_NO_REVERSE },
{ X86::VPMOVSXBWrr, X86::VPMOVSXBWrm, TB_NO_REVERSE },
{ X86::VPMOVSXDQrr, X86::VPMOVSXDQrm, TB_NO_REVERSE },
{ X86::VPMOVSXWDrr, X86::VPMOVSXWDrm, TB_NO_REVERSE },
{ X86::VPMOVSXWQrr, X86::VPMOVSXWQrm, TB_NO_REVERSE },
{ X86::VPMOVZXBDrr, X86::VPMOVZXBDrm, TB_NO_REVERSE },
{ X86::VPMOVZXBQrr, X86::VPMOVZXBQrm, TB_NO_REVERSE },
{ X86::VPMOVZXBWrr, X86::VPMOVZXBWrm, TB_NO_REVERSE },
{ X86::VPMOVZXDQrr, X86::VPMOVZXDQrm, TB_NO_REVERSE },
{ X86::VPMOVZXWDrr, X86::VPMOVZXWDrm, TB_NO_REVERSE },
{ X86::VPMOVZXWQrr, X86::VPMOVZXWQrm, TB_NO_REVERSE },
{ X86::VPSHUFDri, X86::VPSHUFDmi, 0 },
{ X86::VPSHUFHWri, X86::VPSHUFHWmi, 0 },
{ X86::VPSHUFLWri, X86::VPSHUFLWmi, 0 },
{ X86::VPTESTrr, X86::VPTESTrm, 0 },
{ X86::VRCPPSr, X86::VRCPPSm, 0 },
{ X86::VROUNDPDr, X86::VROUNDPDm, 0 },
{ X86::VROUNDPSr, X86::VROUNDPSm, 0 },
{ X86::VRSQRTPSr, X86::VRSQRTPSm, 0 },
{ X86::VSQRTPDr, X86::VSQRTPDm, 0 },
{ X86::VSQRTPSr, X86::VSQRTPSm, 0 },
{ X86::VTESTPDrr, X86::VTESTPDrm, 0 },
{ X86::VTESTPSrr, X86::VTESTPSrm, 0 },
{ X86::VUCOMISDrr, X86::VUCOMISDrm, 0 },
{ X86::VUCOMISSrr, X86::VUCOMISSrm, 0 },
// AVX 256-bit foldable instructions
{ X86::VCVTDQ2PDYrr, X86::VCVTDQ2PDYrm, TB_NO_REVERSE },
{ X86::VCVTDQ2PSYrr, X86::VCVTDQ2PSYrm, 0 },
{ X86::VCVTPD2DQYrr, X86::VCVTPD2DQYrm, 0 },
{ X86::VCVTPD2PSYrr, X86::VCVTPD2PSYrm, 0 },
{ X86::VCVTPS2DQYrr, X86::VCVTPS2DQYrm, 0 },
{ X86::VCVTPS2PDYrr, X86::VCVTPS2PDYrm, TB_NO_REVERSE },
{ X86::VCVTTPD2DQYrr, X86::VCVTTPD2DQYrm, 0 },
{ X86::VCVTTPS2DQYrr, X86::VCVTTPS2DQYrm, 0 },
{ X86::VMOVAPDYrr, X86::VMOVAPDYrm, TB_ALIGN_32 },
{ X86::VMOVAPSYrr, X86::VMOVAPSYrm, TB_ALIGN_32 },
{ X86::VMOVDDUPYrr, X86::VMOVDDUPYrm, 0 },
{ X86::VMOVDQAYrr, X86::VMOVDQAYrm, TB_ALIGN_32 },
{ X86::VMOVDQUYrr, X86::VMOVDQUYrm, 0 },
{ X86::VMOVSLDUPYrr, X86::VMOVSLDUPYrm, 0 },
{ X86::VMOVSHDUPYrr, X86::VMOVSHDUPYrm, 0 },
{ X86::VMOVUPDYrr, X86::VMOVUPDYrm, 0 },
{ X86::VMOVUPSYrr, X86::VMOVUPSYrm, 0 },
{ X86::VPERMILPDYri, X86::VPERMILPDYmi, 0 },
{ X86::VPERMILPSYri, X86::VPERMILPSYmi, 0 },
{ X86::VPTESTYrr, X86::VPTESTYrm, 0 },
{ X86::VRCPPSYr, X86::VRCPPSYm, 0 },
{ X86::VROUNDYPDr, X86::VROUNDYPDm, 0 },
{ X86::VROUNDYPSr, X86::VROUNDYPSm, 0 },
{ X86::VRSQRTPSYr, X86::VRSQRTPSYm, 0 },
{ X86::VSQRTPDYr, X86::VSQRTPDYm, 0 },
{ X86::VSQRTPSYr, X86::VSQRTPSYm, 0 },
{ X86::VTESTPDYrr, X86::VTESTPDYrm, 0 },
{ X86::VTESTPSYrr, X86::VTESTPSYrm, 0 },
// AVX2 foldable instructions
// VBROADCASTS{SD}rr register instructions were an AVX2 addition while the
// VBROADCASTS{SD}rm memory instructions were available from AVX1.
// TB_NO_REVERSE prevents unfolding from introducing an illegal instruction
// on AVX1 targets. The VPBROADCAST instructions are all AVX2 instructions
// so they don't need an equivalent limitation.
{ X86::VBROADCASTSSrr, X86::VBROADCASTSSrm, TB_NO_REVERSE },
{ X86::VBROADCASTSSYrr, X86::VBROADCASTSSYrm, TB_NO_REVERSE },
{ X86::VBROADCASTSDYrr, X86::VBROADCASTSDYrm, TB_NO_REVERSE },
{ X86::VPABSBYrr, X86::VPABSBYrm, 0 },
{ X86::VPABSDYrr, X86::VPABSDYrm, 0 },
{ X86::VPABSWYrr, X86::VPABSWYrm, 0 },
{ X86::VPBROADCASTBrr, X86::VPBROADCASTBrm, TB_NO_REVERSE },
{ X86::VPBROADCASTBYrr, X86::VPBROADCASTBYrm, TB_NO_REVERSE },
{ X86::VPBROADCASTDrr, X86::VPBROADCASTDrm, TB_NO_REVERSE },
{ X86::VPBROADCASTDYrr, X86::VPBROADCASTDYrm, TB_NO_REVERSE },
{ X86::VPBROADCASTQrr, X86::VPBROADCASTQrm, TB_NO_REVERSE },
{ X86::VPBROADCASTQYrr, X86::VPBROADCASTQYrm, TB_NO_REVERSE },
{ X86::VPBROADCASTWrr, X86::VPBROADCASTWrm, TB_NO_REVERSE },
{ X86::VPBROADCASTWYrr, X86::VPBROADCASTWYrm, TB_NO_REVERSE },
{ X86::VPERMPDYri, X86::VPERMPDYmi, 0 },
{ X86::VPERMQYri, X86::VPERMQYmi, 0 },
{ X86::VPMOVSXBDYrr, X86::VPMOVSXBDYrm, TB_NO_REVERSE },
{ X86::VPMOVSXBQYrr, X86::VPMOVSXBQYrm, TB_NO_REVERSE },
{ X86::VPMOVSXBWYrr, X86::VPMOVSXBWYrm, 0 },
{ X86::VPMOVSXDQYrr, X86::VPMOVSXDQYrm, 0 },
{ X86::VPMOVSXWDYrr, X86::VPMOVSXWDYrm, 0 },
{ X86::VPMOVSXWQYrr, X86::VPMOVSXWQYrm, TB_NO_REVERSE },
{ X86::VPMOVZXBDYrr, X86::VPMOVZXBDYrm, TB_NO_REVERSE },
{ X86::VPMOVZXBQYrr, X86::VPMOVZXBQYrm, TB_NO_REVERSE },
{ X86::VPMOVZXBWYrr, X86::VPMOVZXBWYrm, 0 },
{ X86::VPMOVZXDQYrr, X86::VPMOVZXDQYrm, 0 },
{ X86::VPMOVZXWDYrr, X86::VPMOVZXWDYrm, 0 },
{ X86::VPMOVZXWQYrr, X86::VPMOVZXWQYrm, TB_NO_REVERSE },
{ X86::VPSHUFDYri, X86::VPSHUFDYmi, 0 },
{ X86::VPSHUFHWYri, X86::VPSHUFHWYmi, 0 },
{ X86::VPSHUFLWYri, X86::VPSHUFLWYmi, 0 },
// XOP foldable instructions
{ X86::VFRCZPDrr, X86::VFRCZPDrm, 0 },
{ X86::VFRCZPDrrY, X86::VFRCZPDrmY, 0 },
{ X86::VFRCZPSrr, X86::VFRCZPSrm, 0 },
{ X86::VFRCZPSrrY, X86::VFRCZPSrmY, 0 },
{ X86::VFRCZSDrr, X86::VFRCZSDrm, 0 },
{ X86::VFRCZSSrr, X86::VFRCZSSrm, 0 },
{ X86::VPHADDBDrr, X86::VPHADDBDrm, 0 },
{ X86::VPHADDBQrr, X86::VPHADDBQrm, 0 },
{ X86::VPHADDBWrr, X86::VPHADDBWrm, 0 },
{ X86::VPHADDDQrr, X86::VPHADDDQrm, 0 },
{ X86::VPHADDWDrr, X86::VPHADDWDrm, 0 },
{ X86::VPHADDWQrr, X86::VPHADDWQrm, 0 },
{ X86::VPHADDUBDrr, X86::VPHADDUBDrm, 0 },
{ X86::VPHADDUBQrr, X86::VPHADDUBQrm, 0 },
{ X86::VPHADDUBWrr, X86::VPHADDUBWrm, 0 },
{ X86::VPHADDUDQrr, X86::VPHADDUDQrm, 0 },
{ X86::VPHADDUWDrr, X86::VPHADDUWDrm, 0 },
{ X86::VPHADDUWQrr, X86::VPHADDUWQrm, 0 },
{ X86::VPHSUBBWrr, X86::VPHSUBBWrm, 0 },
{ X86::VPHSUBDQrr, X86::VPHSUBDQrm, 0 },
{ X86::VPHSUBWDrr, X86::VPHSUBWDrm, 0 },
{ X86::VPROTBri, X86::VPROTBmi, 0 },
{ X86::VPROTBrr, X86::VPROTBmr, 0 },
{ X86::VPROTDri, X86::VPROTDmi, 0 },
{ X86::VPROTDrr, X86::VPROTDmr, 0 },
{ X86::VPROTQri, X86::VPROTQmi, 0 },
{ X86::VPROTQrr, X86::VPROTQmr, 0 },
{ X86::VPROTWri, X86::VPROTWmi, 0 },
{ X86::VPROTWrr, X86::VPROTWmr, 0 },
{ X86::VPSHABrr, X86::VPSHABmr, 0 },
{ X86::VPSHADrr, X86::VPSHADmr, 0 },
{ X86::VPSHAQrr, X86::VPSHAQmr, 0 },
{ X86::VPSHAWrr, X86::VPSHAWmr, 0 },
{ X86::VPSHLBrr, X86::VPSHLBmr, 0 },
{ X86::VPSHLDrr, X86::VPSHLDmr, 0 },
{ X86::VPSHLQrr, X86::VPSHLQmr, 0 },
{ X86::VPSHLWrr, X86::VPSHLWmr, 0 },
// BMI/BMI2/LZCNT/POPCNT/TBM foldable instructions
{ X86::BEXTR32rr, X86::BEXTR32rm, 0 },
{ X86::BEXTR64rr, X86::BEXTR64rm, 0 },
{ X86::BEXTRI32ri, X86::BEXTRI32mi, 0 },
{ X86::BEXTRI64ri, X86::BEXTRI64mi, 0 },
{ X86::BLCFILL32rr, X86::BLCFILL32rm, 0 },
{ X86::BLCFILL64rr, X86::BLCFILL64rm, 0 },
{ X86::BLCI32rr, X86::BLCI32rm, 0 },
{ X86::BLCI64rr, X86::BLCI64rm, 0 },
{ X86::BLCIC32rr, X86::BLCIC32rm, 0 },
{ X86::BLCIC64rr, X86::BLCIC64rm, 0 },
{ X86::BLCMSK32rr, X86::BLCMSK32rm, 0 },
{ X86::BLCMSK64rr, X86::BLCMSK64rm, 0 },
{ X86::BLCS32rr, X86::BLCS32rm, 0 },
{ X86::BLCS64rr, X86::BLCS64rm, 0 },
{ X86::BLSFILL32rr, X86::BLSFILL32rm, 0 },
{ X86::BLSFILL64rr, X86::BLSFILL64rm, 0 },
{ X86::BLSI32rr, X86::BLSI32rm, 0 },
{ X86::BLSI64rr, X86::BLSI64rm, 0 },
{ X86::BLSIC32rr, X86::BLSIC32rm, 0 },
{ X86::BLSIC64rr, X86::BLSIC64rm, 0 },
{ X86::BLSMSK32rr, X86::BLSMSK32rm, 0 },
{ X86::BLSMSK64rr, X86::BLSMSK64rm, 0 },
{ X86::BLSR32rr, X86::BLSR32rm, 0 },
{ X86::BLSR64rr, X86::BLSR64rm, 0 },
{ X86::BZHI32rr, X86::BZHI32rm, 0 },
{ X86::BZHI64rr, X86::BZHI64rm, 0 },
{ X86::LZCNT16rr, X86::LZCNT16rm, 0 },
{ X86::LZCNT32rr, X86::LZCNT32rm, 0 },
{ X86::LZCNT64rr, X86::LZCNT64rm, 0 },
{ X86::POPCNT16rr, X86::POPCNT16rm, 0 },
{ X86::POPCNT32rr, X86::POPCNT32rm, 0 },
{ X86::POPCNT64rr, X86::POPCNT64rm, 0 },
{ X86::RORX32ri, X86::RORX32mi, 0 },
{ X86::RORX64ri, X86::RORX64mi, 0 },
{ X86::SARX32rr, X86::SARX32rm, 0 },
{ X86::SARX64rr, X86::SARX64rm, 0 },
{ X86::SHRX32rr, X86::SHRX32rm, 0 },
{ X86::SHRX64rr, X86::SHRX64rm, 0 },
{ X86::SHLX32rr, X86::SHLX32rm, 0 },
{ X86::SHLX64rr, X86::SHLX64rm, 0 },
{ X86::T1MSKC32rr, X86::T1MSKC32rm, 0 },
{ X86::T1MSKC64rr, X86::T1MSKC64rm, 0 },
{ X86::TZCNT16rr, X86::TZCNT16rm, 0 },
{ X86::TZCNT32rr, X86::TZCNT32rm, 0 },
{ X86::TZCNT64rr, X86::TZCNT64rm, 0 },
{ X86::TZMSK32rr, X86::TZMSK32rm, 0 },
{ X86::TZMSK64rr, X86::TZMSK64rm, 0 },
// AVX-512 foldable instructions
{ X86::VBROADCASTSSZr, X86::VBROADCASTSSZm, TB_NO_REVERSE },
{ X86::VBROADCASTSSZr_s, X86::VBROADCASTSSZm, TB_NO_REVERSE },
{ X86::VBROADCASTSDZr, X86::VBROADCASTSDZm, TB_NO_REVERSE },
{ X86::VBROADCASTSDZr_s, X86::VBROADCASTSDZm, TB_NO_REVERSE },
{ X86::VMOV64toPQIZrr, X86::VMOVQI2PQIZrm, 0 },
{ X86::VMOVZPQILo2PQIZrr,X86::VMOVQI2PQIZrm, TB_NO_REVERSE },
{ X86::VMOVDI2SSZrr, X86::VMOVDI2SSZrm, 0 },
{ X86::VMOVAPDZrr, X86::VMOVAPDZrm, TB_ALIGN_64 },
{ X86::VMOVAPSZrr, X86::VMOVAPSZrm, TB_ALIGN_64 },
{ X86::VMOVDQA32Zrr, X86::VMOVDQA32Zrm, TB_ALIGN_64 },
{ X86::VMOVDQA64Zrr, X86::VMOVDQA64Zrm, TB_ALIGN_64 },
{ X86::VMOVDQU8Zrr, X86::VMOVDQU8Zrm, 0 },
{ X86::VMOVDQU16Zrr, X86::VMOVDQU16Zrm, 0 },
{ X86::VMOVDQU32Zrr, X86::VMOVDQU32Zrm, 0 },
{ X86::VMOVDQU64Zrr, X86::VMOVDQU64Zrm, 0 },
{ X86::VMOVUPDZrr, X86::VMOVUPDZrm, 0 },
{ X86::VMOVUPSZrr, X86::VMOVUPSZrm, 0 },
{ X86::VPABSDZrr, X86::VPABSDZrm, 0 },
{ X86::VPABSQZrr, X86::VPABSQZrm, 0 },
{ X86::VPERMILPDZri, X86::VPERMILPDZmi, 0 },
{ X86::VPERMILPSZri, X86::VPERMILPSZmi, 0 },
{ X86::VPERMPDZri, X86::VPERMPDZmi, 0 },
{ X86::VPERMQZri, X86::VPERMQZmi, 0 },
{ X86::VPMOVSXBDZrr, X86::VPMOVSXBDZrm, 0 },
{ X86::VPMOVSXBQZrr, X86::VPMOVSXBQZrm, TB_NO_REVERSE },
{ X86::VPMOVSXBWZrr, X86::VPMOVSXBWZrm, 0 },
{ X86::VPMOVSXDQZrr, X86::VPMOVSXDQZrm, 0 },
{ X86::VPMOVSXWDZrr, X86::VPMOVSXWDZrm, 0 },
{ X86::VPMOVSXWQZrr, X86::VPMOVSXWQZrm, 0 },
{ X86::VPMOVZXBDZrr, X86::VPMOVZXBDZrm, 0 },
{ X86::VPMOVZXBQZrr, X86::VPMOVZXBQZrm, TB_NO_REVERSE },
{ X86::VPMOVZXBWZrr, X86::VPMOVZXBWZrm, 0 },
{ X86::VPMOVZXDQZrr, X86::VPMOVZXDQZrm, 0 },
{ X86::VPMOVZXWDZrr, X86::VPMOVZXWDZrm, 0 },
{ X86::VPMOVZXWQZrr, X86::VPMOVZXWQZrm, 0 },
{ X86::VPSHUFDZri, X86::VPSHUFDZmi, 0 },
{ X86::VPSHUFHWZri, X86::VPSHUFHWZmi, 0 },
{ X86::VPSHUFLWZri, X86::VPSHUFLWZmi, 0 },
// AVX-512 foldable instructions (256-bit versions)
{ X86::VBROADCASTSSZ256r, X86::VBROADCASTSSZ256m, TB_NO_REVERSE },
{ X86::VBROADCASTSSZ256r_s, X86::VBROADCASTSSZ256m, TB_NO_REVERSE },
{ X86::VBROADCASTSDZ256r, X86::VBROADCASTSDZ256m, TB_NO_REVERSE },
{ X86::VBROADCASTSDZ256r_s, X86::VBROADCASTSDZ256m, TB_NO_REVERSE },
{ X86::VMOVAPDZ256rr, X86::VMOVAPDZ256rm, TB_ALIGN_32 },
{ X86::VMOVAPSZ256rr, X86::VMOVAPSZ256rm, TB_ALIGN_32 },
{ X86::VMOVDQA32Z256rr, X86::VMOVDQA32Z256rm, TB_ALIGN_32 },
{ X86::VMOVDQA64Z256rr, X86::VMOVDQA64Z256rm, TB_ALIGN_32 },
{ X86::VMOVDQU8Z256rr, X86::VMOVDQU8Z256rm, 0 },
{ X86::VMOVDQU16Z256rr, X86::VMOVDQU16Z256rm, 0 },
{ X86::VMOVDQU32Z256rr, X86::VMOVDQU32Z256rm, 0 },
{ X86::VMOVDQU64Z256rr, X86::VMOVDQU64Z256rm, 0 },
{ X86::VMOVUPDZ256rr, X86::VMOVUPDZ256rm, 0 },
{ X86::VMOVUPSZ256rr, X86::VMOVUPSZ256rm, 0 },
{ X86::VPERMILPDZ256ri, X86::VPERMILPDZ256mi, 0 },
{ X86::VPERMILPSZ256ri, X86::VPERMILPSZ256mi, 0 },
{ X86::VPERMPDZ256ri, X86::VPERMPDZ256mi, 0 },
{ X86::VPERMQZ256ri, X86::VPERMQZ256mi, 0 },
{ X86::VPMOVSXBDZ256rr, X86::VPMOVSXBDZ256rm, TB_NO_REVERSE },
{ X86::VPMOVSXBQZ256rr, X86::VPMOVSXBQZ256rm, TB_NO_REVERSE },
{ X86::VPMOVSXBWZ256rr, X86::VPMOVSXBWZ256rm, 0 },
{ X86::VPMOVSXDQZ256rr, X86::VPMOVSXDQZ256rm, 0 },
{ X86::VPMOVSXWDZ256rr, X86::VPMOVSXWDZ256rm, 0 },
{ X86::VPMOVSXWQZ256rr, X86::VPMOVSXWQZ256rm, TB_NO_REVERSE },
{ X86::VPMOVZXBDZ256rr, X86::VPMOVZXBDZ256rm, TB_NO_REVERSE },
{ X86::VPMOVZXBQZ256rr, X86::VPMOVZXBQZ256rm, TB_NO_REVERSE },
{ X86::VPMOVZXBWZ256rr, X86::VPMOVZXBWZ256rm, 0 },
{ X86::VPMOVZXDQZ256rr, X86::VPMOVZXDQZ256rm, 0 },
{ X86::VPMOVZXWDZ256rr, X86::VPMOVZXWDZ256rm, 0 },
{ X86::VPMOVZXWQZ256rr, X86::VPMOVZXWQZ256rm, TB_NO_REVERSE },
{ X86::VPSHUFDZ256ri, X86::VPSHUFDZ256mi, 0 },
{ X86::VPSHUFHWZ256ri, X86::VPSHUFHWZ256mi, 0 },
{ X86::VPSHUFLWZ256ri, X86::VPSHUFLWZ256mi, 0 },
// AVX-512 foldable instructions (128-bit versions)
{ X86::VBROADCASTSSZ128r, X86::VBROADCASTSSZ128m, TB_NO_REVERSE },
{ X86::VBROADCASTSSZ128r_s, X86::VBROADCASTSSZ128m, TB_NO_REVERSE },
{ X86::VMOVAPDZ128rr, X86::VMOVAPDZ128rm, TB_ALIGN_16 },
{ X86::VMOVAPSZ128rr, X86::VMOVAPSZ128rm, TB_ALIGN_16 },
{ X86::VMOVDQA32Z128rr, X86::VMOVDQA32Z128rm, TB_ALIGN_16 },
{ X86::VMOVDQA64Z128rr, X86::VMOVDQA64Z128rm, TB_ALIGN_16 },
{ X86::VMOVDQU8Z128rr, X86::VMOVDQU8Z128rm, 0 },
{ X86::VMOVDQU16Z128rr, X86::VMOVDQU16Z128rm, 0 },
{ X86::VMOVDQU32Z128rr, X86::VMOVDQU32Z128rm, 0 },
{ X86::VMOVDQU64Z128rr, X86::VMOVDQU64Z128rm, 0 },
{ X86::VMOVUPDZ128rr, X86::VMOVUPDZ128rm, 0 },
{ X86::VMOVUPSZ128rr, X86::VMOVUPSZ128rm, 0 },
{ X86::VPERMILPDZ128ri, X86::VPERMILPDZ128mi, 0 },
{ X86::VPERMILPSZ128ri, X86::VPERMILPSZ128mi, 0 },
{ X86::VPMOVSXBDZ128rr, X86::VPMOVSXBDZ128rm, TB_NO_REVERSE },
{ X86::VPMOVSXBQZ128rr, X86::VPMOVSXBQZ128rm, TB_NO_REVERSE },
{ X86::VPMOVSXBWZ128rr, X86::VPMOVSXBWZ128rm, TB_NO_REVERSE },
{ X86::VPMOVSXDQZ128rr, X86::VPMOVSXDQZ128rm, TB_NO_REVERSE },
{ X86::VPMOVSXWDZ128rr, X86::VPMOVSXWDZ128rm, TB_NO_REVERSE },
{ X86::VPMOVSXWQZ128rr, X86::VPMOVSXWQZ128rm, TB_NO_REVERSE },
{ X86::VPMOVZXBDZ128rr, X86::VPMOVZXBDZ128rm, TB_NO_REVERSE },
{ X86::VPMOVZXBQZ128rr, X86::VPMOVZXBQZ128rm, TB_NO_REVERSE },
{ X86::VPMOVZXBWZ128rr, X86::VPMOVZXBWZ128rm, TB_NO_REVERSE },
{ X86::VPMOVZXDQZ128rr, X86::VPMOVZXDQZ128rm, TB_NO_REVERSE },
{ X86::VPMOVZXWDZ128rr, X86::VPMOVZXWDZ128rm, TB_NO_REVERSE },
{ X86::VPMOVZXWQZ128rr, X86::VPMOVZXWQZ128rm, TB_NO_REVERSE },
{ X86::VPSHUFDZ128ri, X86::VPSHUFDZ128mi, 0 },
{ X86::VPSHUFHWZ128ri, X86::VPSHUFHWZ128mi, 0 },
{ X86::VPSHUFLWZ128ri, X86::VPSHUFLWZ128mi, 0 },
// F16C foldable instructions
{ X86::VCVTPH2PSrr, X86::VCVTPH2PSrm, 0 },
{ X86::VCVTPH2PSYrr, X86::VCVTPH2PSYrm, 0 },
// AES foldable instructions
{ X86::AESIMCrr, X86::AESIMCrm, TB_ALIGN_16 },
{ X86::AESKEYGENASSIST128rr, X86::AESKEYGENASSIST128rm, TB_ALIGN_16 },
{ X86::VAESIMCrr, X86::VAESIMCrm, 0 },
{ X86::VAESKEYGENASSIST128rr, X86::VAESKEYGENASSIST128rm, 0 }
};
for (X86MemoryFoldTableEntry Entry : MemoryFoldTable1) {
AddTableEntry(RegOp2MemOpTable1, MemOp2RegOpTable,
Entry.RegOp, Entry.MemOp,
// Index 1, folded load
Entry.Flags | TB_INDEX_1 | TB_FOLDED_LOAD);
}
static const X86MemoryFoldTableEntry MemoryFoldTable2[] = {
{ X86::ADC32rr, X86::ADC32rm, 0 },
{ X86::ADC64rr, X86::ADC64rm, 0 },
{ X86::ADD16rr, X86::ADD16rm, 0 },
{ X86::ADD16rr_DB, X86::ADD16rm, TB_NO_REVERSE },
{ X86::ADD32rr, X86::ADD32rm, 0 },
{ X86::ADD32rr_DB, X86::ADD32rm, TB_NO_REVERSE },
{ X86::ADD64rr, X86::ADD64rm, 0 },
{ X86::ADD64rr_DB, X86::ADD64rm, TB_NO_REVERSE },
{ X86::ADD8rr, X86::ADD8rm, 0 },
{ X86::ADDPDrr, X86::ADDPDrm, TB_ALIGN_16 },
{ X86::ADDPSrr, X86::ADDPSrm, TB_ALIGN_16 },
{ X86::ADDSDrr, X86::ADDSDrm, 0 },
{ X86::ADDSDrr_Int, X86::ADDSDrm_Int, TB_NO_REVERSE },
{ X86::ADDSSrr, X86::ADDSSrm, 0 },
{ X86::ADDSSrr_Int, X86::ADDSSrm_Int, TB_NO_REVERSE },
{ X86::ADDSUBPDrr, X86::ADDSUBPDrm, TB_ALIGN_16 },
{ X86::ADDSUBPSrr, X86::ADDSUBPSrm, TB_ALIGN_16 },
{ X86::AND16rr, X86::AND16rm, 0 },
{ X86::AND32rr, X86::AND32rm, 0 },
{ X86::AND64rr, X86::AND64rm, 0 },
{ X86::AND8rr, X86::AND8rm, 0 },
{ X86::ANDNPDrr, X86::ANDNPDrm, TB_ALIGN_16 },
{ X86::ANDNPSrr, X86::ANDNPSrm, TB_ALIGN_16 },
{ X86::ANDPDrr, X86::ANDPDrm, TB_ALIGN_16 },
{ X86::ANDPSrr, X86::ANDPSrm, TB_ALIGN_16 },
{ X86::BLENDPDrri, X86::BLENDPDrmi, TB_ALIGN_16 },
{ X86::BLENDPSrri, X86::BLENDPSrmi, TB_ALIGN_16 },
{ X86::BLENDVPDrr0, X86::BLENDVPDrm0, TB_ALIGN_16 },
{ X86::BLENDVPSrr0, X86::BLENDVPSrm0, TB_ALIGN_16 },
{ X86::CMOVA16rr, X86::CMOVA16rm, 0 },
{ X86::CMOVA32rr, X86::CMOVA32rm, 0 },
{ X86::CMOVA64rr, X86::CMOVA64rm, 0 },
{ X86::CMOVAE16rr, X86::CMOVAE16rm, 0 },
{ X86::CMOVAE32rr, X86::CMOVAE32rm, 0 },
{ X86::CMOVAE64rr, X86::CMOVAE64rm, 0 },
{ X86::CMOVB16rr, X86::CMOVB16rm, 0 },
{ X86::CMOVB32rr, X86::CMOVB32rm, 0 },
{ X86::CMOVB64rr, X86::CMOVB64rm, 0 },
{ X86::CMOVBE16rr, X86::CMOVBE16rm, 0 },
{ X86::CMOVBE32rr, X86::CMOVBE32rm, 0 },
{ X86::CMOVBE64rr, X86::CMOVBE64rm, 0 },
{ X86::CMOVE16rr, X86::CMOVE16rm, 0 },
{ X86::CMOVE32rr, X86::CMOVE32rm, 0 },
{ X86::CMOVE64rr, X86::CMOVE64rm, 0 },
{ X86::CMOVG16rr, X86::CMOVG16rm, 0 },
{ X86::CMOVG32rr, X86::CMOVG32rm, 0 },
{ X86::CMOVG64rr, X86::CMOVG64rm, 0 },
{ X86::CMOVGE16rr, X86::CMOVGE16rm, 0 },
{ X86::CMOVGE32rr, X86::CMOVGE32rm, 0 },
{ X86::CMOVGE64rr, X86::CMOVGE64rm, 0 },
{ X86::CMOVL16rr, X86::CMOVL16rm, 0 },
{ X86::CMOVL32rr, X86::CMOVL32rm, 0 },
{ X86::CMOVL64rr, X86::CMOVL64rm, 0 },
{ X86::CMOVLE16rr, X86::CMOVLE16rm, 0 },
{ X86::CMOVLE32rr, X86::CMOVLE32rm, 0 },
{ X86::CMOVLE64rr, X86::CMOVLE64rm, 0 },
{ X86::CMOVNE16rr, X86::CMOVNE16rm, 0 },
{ X86::CMOVNE32rr, X86::CMOVNE32rm, 0 },
{ X86::CMOVNE64rr, X86::CMOVNE64rm, 0 },
{ X86::CMOVNO16rr, X86::CMOVNO16rm, 0 },
{ X86::CMOVNO32rr, X86::CMOVNO32rm, 0 },
{ X86::CMOVNO64rr, X86::CMOVNO64rm, 0 },
{ X86::CMOVNP16rr, X86::CMOVNP16rm, 0 },
{ X86::CMOVNP32rr, X86::CMOVNP32rm, 0 },
{ X86::CMOVNP64rr, X86::CMOVNP64rm, 0 },
{ X86::CMOVNS16rr, X86::CMOVNS16rm, 0 },
{ X86::CMOVNS32rr, X86::CMOVNS32rm, 0 },
{ X86::CMOVNS64rr, X86::CMOVNS64rm, 0 },
{ X86::CMOVO16rr, X86::CMOVO16rm, 0 },
{ X86::CMOVO32rr, X86::CMOVO32rm, 0 },
{ X86::CMOVO64rr, X86::CMOVO64rm, 0 },
{ X86::CMOVP16rr, X86::CMOVP16rm, 0 },
{ X86::CMOVP32rr, X86::CMOVP32rm, 0 },
{ X86::CMOVP64rr, X86::CMOVP64rm, 0 },
{ X86::CMOVS16rr, X86::CMOVS16rm, 0 },
{ X86::CMOVS32rr, X86::CMOVS32rm, 0 },
{ X86::CMOVS64rr, X86::CMOVS64rm, 0 },
{ X86::CMPPDrri, X86::CMPPDrmi, TB_ALIGN_16 },
{ X86::CMPPSrri, X86::CMPPSrmi, TB_ALIGN_16 },
{ X86::CMPSDrr, X86::CMPSDrm, 0 },
{ X86::CMPSSrr, X86::CMPSSrm, 0 },
{ X86::CRC32r32r32, X86::CRC32r32m32, 0 },
{ X86::CRC32r64r64, X86::CRC32r64m64, 0 },
{ X86::DIVPDrr, X86::DIVPDrm, TB_ALIGN_16 },
{ X86::DIVPSrr, X86::DIVPSrm, TB_ALIGN_16 },
{ X86::DIVSDrr, X86::DIVSDrm, 0 },
{ X86::DIVSDrr_Int, X86::DIVSDrm_Int, TB_NO_REVERSE },
{ X86::DIVSSrr, X86::DIVSSrm, 0 },
{ X86::DIVSSrr_Int, X86::DIVSSrm_Int, TB_NO_REVERSE },
{ X86::DPPDrri, X86::DPPDrmi, TB_ALIGN_16 },
{ X86::DPPSrri, X86::DPPSrmi, TB_ALIGN_16 },
{ X86::HADDPDrr, X86::HADDPDrm, TB_ALIGN_16 },
{ X86::HADDPSrr, X86::HADDPSrm, TB_ALIGN_16 },
{ X86::HSUBPDrr, X86::HSUBPDrm, TB_ALIGN_16 },
{ X86::HSUBPSrr, X86::HSUBPSrm, TB_ALIGN_16 },
{ X86::IMUL16rr, X86::IMUL16rm, 0 },
{ X86::IMUL32rr, X86::IMUL32rm, 0 },
{ X86::IMUL64rr, X86::IMUL64rm, 0 },
{ X86::Int_CMPSDrr, X86::Int_CMPSDrm, TB_NO_REVERSE },
{ X86::Int_CMPSSrr, X86::Int_CMPSSrm, TB_NO_REVERSE },
{ X86::Int_CVTSD2SSrr, X86::Int_CVTSD2SSrm, TB_NO_REVERSE },
{ X86::Int_CVTSI2SD64rr,X86::Int_CVTSI2SD64rm, 0 },
{ X86::Int_CVTSI2SDrr, X86::Int_CVTSI2SDrm, 0 },
{ X86::Int_CVTSI2SS64rr,X86::Int_CVTSI2SS64rm, 0 },
{ X86::Int_CVTSI2SSrr, X86::Int_CVTSI2SSrm, 0 },
{ X86::Int_CVTSS2SDrr, X86::Int_CVTSS2SDrm, TB_NO_REVERSE },
{ X86::MAXPDrr, X86::MAXPDrm, TB_ALIGN_16 },
{ X86::MAXCPDrr, X86::MAXCPDrm, TB_ALIGN_16 },
{ X86::MAXPSrr, X86::MAXPSrm, TB_ALIGN_16 },
{ X86::MAXCPSrr, X86::MAXCPSrm, TB_ALIGN_16 },
{ X86::MAXSDrr, X86::MAXSDrm, 0 },
{ X86::MAXCSDrr, X86::MAXCSDrm, 0 },
{ X86::MAXSDrr_Int, X86::MAXSDrm_Int, TB_NO_REVERSE },
{ X86::MAXSSrr, X86::MAXSSrm, 0 },
{ X86::MAXCSSrr, X86::MAXCSSrm, 0 },
{ X86::MAXSSrr_Int, X86::MAXSSrm_Int, TB_NO_REVERSE },
{ X86::MINPDrr, X86::MINPDrm, TB_ALIGN_16 },
{ X86::MINCPDrr, X86::MINCPDrm, TB_ALIGN_16 },
{ X86::MINPSrr, X86::MINPSrm, TB_ALIGN_16 },
{ X86::MINCPSrr, X86::MINCPSrm, TB_ALIGN_16 },
{ X86::MINSDrr, X86::MINSDrm, 0 },
{ X86::MINCSDrr, X86::MINCSDrm, 0 },
{ X86::MINSDrr_Int, X86::MINSDrm_Int, TB_NO_REVERSE },
{ X86::MINSSrr, X86::MINSSrm, 0 },
{ X86::MINCSSrr, X86::MINCSSrm, 0 },
{ X86::MINSSrr_Int, X86::MINSSrm_Int, TB_NO_REVERSE },
{ X86::MOVLHPSrr, X86::MOVHPSrm, TB_NO_REVERSE },
{ X86::MPSADBWrri, X86::MPSADBWrmi, TB_ALIGN_16 },
{ X86::MULPDrr, X86::MULPDrm, TB_ALIGN_16 },
{ X86::MULPSrr, X86::MULPSrm, TB_ALIGN_16 },
{ X86::MULSDrr, X86::MULSDrm, 0 },
{ X86::MULSDrr_Int, X86::MULSDrm_Int, TB_NO_REVERSE },
{ X86::MULSSrr, X86::MULSSrm, 0 },
{ X86::MULSSrr_Int, X86::MULSSrm_Int, TB_NO_REVERSE },
{ X86::OR16rr, X86::OR16rm, 0 },
{ X86::OR32rr, X86::OR32rm, 0 },
{ X86::OR64rr, X86::OR64rm, 0 },
{ X86::OR8rr, X86::OR8rm, 0 },
{ X86::ORPDrr, X86::ORPDrm, TB_ALIGN_16 },
{ X86::ORPSrr, X86::ORPSrm, TB_ALIGN_16 },
{ X86::PACKSSDWrr, X86::PACKSSDWrm, TB_ALIGN_16 },
{ X86::PACKSSWBrr, X86::PACKSSWBrm, TB_ALIGN_16 },
{ X86::PACKUSDWrr, X86::PACKUSDWrm, TB_ALIGN_16 },
{ X86::PACKUSWBrr, X86::PACKUSWBrm, TB_ALIGN_16 },
{ X86::PADDBrr, X86::PADDBrm, TB_ALIGN_16 },
{ X86::PADDDrr, X86::PADDDrm, TB_ALIGN_16 },
{ X86::PADDQrr, X86::PADDQrm, TB_ALIGN_16 },
{ X86::PADDSBrr, X86::PADDSBrm, TB_ALIGN_16 },
{ X86::PADDSWrr, X86::PADDSWrm, TB_ALIGN_16 },
{ X86::PADDUSBrr, X86::PADDUSBrm, TB_ALIGN_16 },
{ X86::PADDUSWrr, X86::PADDUSWrm, TB_ALIGN_16 },
{ X86::PADDWrr, X86::PADDWrm, TB_ALIGN_16 },
{ X86::PALIGNRrri, X86::PALIGNRrmi, TB_ALIGN_16 },
{ X86::PANDNrr, X86::PANDNrm, TB_ALIGN_16 },
{ X86::PANDrr, X86::PANDrm, TB_ALIGN_16 },
{ X86::PAVGBrr, X86::PAVGBrm, TB_ALIGN_16 },
{ X86::PAVGWrr, X86::PAVGWrm, TB_ALIGN_16 },
{ X86::PBLENDVBrr0, X86::PBLENDVBrm0, TB_ALIGN_16 },
{ X86::PBLENDWrri, X86::PBLENDWrmi, TB_ALIGN_16 },
{ X86::PCLMULQDQrr, X86::PCLMULQDQrm, TB_ALIGN_16 },
{ X86::PCMPEQBrr, X86::PCMPEQBrm, TB_ALIGN_16 },
{ X86::PCMPEQDrr, X86::PCMPEQDrm, TB_ALIGN_16 },
{ X86::PCMPEQQrr, X86::PCMPEQQrm, TB_ALIGN_16 },
{ X86::PCMPEQWrr, X86::PCMPEQWrm, TB_ALIGN_16 },
{ X86::PCMPGTBrr, X86::PCMPGTBrm, TB_ALIGN_16 },
{ X86::PCMPGTDrr, X86::PCMPGTDrm, TB_ALIGN_16 },
{ X86::PCMPGTQrr, X86::PCMPGTQrm, TB_ALIGN_16 },
{ X86::PCMPGTWrr, X86::PCMPGTWrm, TB_ALIGN_16 },
{ X86::PHADDDrr, X86::PHADDDrm, TB_ALIGN_16 },
{ X86::PHADDWrr, X86::PHADDWrm, TB_ALIGN_16 },
{ X86::PHADDSWrr128, X86::PHADDSWrm128, TB_ALIGN_16 },
{ X86::PHSUBDrr, X86::PHSUBDrm, TB_ALIGN_16 },
{ X86::PHSUBSWrr128, X86::PHSUBSWrm128, TB_ALIGN_16 },
{ X86::PHSUBWrr, X86::PHSUBWrm, TB_ALIGN_16 },
{ X86::PINSRBrr, X86::PINSRBrm, 0 },
{ X86::PINSRDrr, X86::PINSRDrm, 0 },
{ X86::PINSRQrr, X86::PINSRQrm, 0 },
{ X86::PINSRWrri, X86::PINSRWrmi, 0 },
{ X86::PMADDUBSWrr, X86::PMADDUBSWrm, TB_ALIGN_16 },
{ X86::PMADDWDrr, X86::PMADDWDrm, TB_ALIGN_16 },
{ X86::PMAXSWrr, X86::PMAXSWrm, TB_ALIGN_16 },
{ X86::PMAXUBrr, X86::PMAXUBrm, TB_ALIGN_16 },
{ X86::PMINSWrr, X86::PMINSWrm, TB_ALIGN_16 },
{ X86::PMINUBrr, X86::PMINUBrm, TB_ALIGN_16 },
{ X86::PMINSBrr, X86::PMINSBrm, TB_ALIGN_16 },
{ X86::PMINSDrr, X86::PMINSDrm, TB_ALIGN_16 },
{ X86::PMINUDrr, X86::PMINUDrm, TB_ALIGN_16 },
{ X86::PMINUWrr, X86::PMINUWrm, TB_ALIGN_16 },
{ X86::PMAXSBrr, X86::PMAXSBrm, TB_ALIGN_16 },
{ X86::PMAXSDrr, X86::PMAXSDrm, TB_ALIGN_16 },
{ X86::PMAXUDrr, X86::PMAXUDrm, TB_ALIGN_16 },
{ X86::PMAXUWrr, X86::PMAXUWrm, TB_ALIGN_16 },
{ X86::PMULDQrr, X86::PMULDQrm, TB_ALIGN_16 },
{ X86::PMULHRSWrr, X86::PMULHRSWrm, TB_ALIGN_16 },
{ X86::PMULHUWrr, X86::PMULHUWrm, TB_ALIGN_16 },
{ X86::PMULHWrr, X86::PMULHWrm, TB_ALIGN_16 },
{ X86::PMULLDrr, X86::PMULLDrm, TB_ALIGN_16 },
{ X86::PMULLWrr, X86::PMULLWrm, TB_ALIGN_16 },
{ X86::PMULUDQrr, X86::PMULUDQrm, TB_ALIGN_16 },
{ X86::PORrr, X86::PORrm, TB_ALIGN_16 },
{ X86::PSADBWrr, X86::PSADBWrm, TB_ALIGN_16 },
{ X86::PSHUFBrr, X86::PSHUFBrm, TB_ALIGN_16 },
{ X86::PSIGNBrr128, X86::PSIGNBrm128, TB_ALIGN_16 },
{ X86::PSIGNWrr128, X86::PSIGNWrm128, TB_ALIGN_16 },
{ X86::PSIGNDrr128, X86::PSIGNDrm128, TB_ALIGN_16 },
{ X86::PSLLDrr, X86::PSLLDrm, TB_ALIGN_16 },
{ X86::PSLLQrr, X86::PSLLQrm, TB_ALIGN_16 },
{ X86::PSLLWrr, X86::PSLLWrm, TB_ALIGN_16 },
{ X86::PSRADrr, X86::PSRADrm, TB_ALIGN_16 },
{ X86::PSRAWrr, X86::PSRAWrm, TB_ALIGN_16 },
{ X86::PSRLDrr, X86::PSRLDrm, TB_ALIGN_16 },
{ X86::PSRLQrr, X86::PSRLQrm, TB_ALIGN_16 },
{ X86::PSRLWrr, X86::PSRLWrm, TB_ALIGN_16 },
{ X86::PSUBBrr, X86::PSUBBrm, TB_ALIGN_16 },
{ X86::PSUBDrr, X86::PSUBDrm, TB_ALIGN_16 },
{ X86::PSUBQrr, X86::PSUBQrm, TB_ALIGN_16 },
{ X86::PSUBSBrr, X86::PSUBSBrm, TB_ALIGN_16 },
{ X86::PSUBSWrr, X86::PSUBSWrm, TB_ALIGN_16 },
{ X86::PSUBUSBrr, X86::PSUBUSBrm, TB_ALIGN_16 },
{ X86::PSUBUSWrr, X86::PSUBUSWrm, TB_ALIGN_16 },
{ X86::PSUBWrr, X86::PSUBWrm, TB_ALIGN_16 },
{ X86::PUNPCKHBWrr, X86::PUNPCKHBWrm, TB_ALIGN_16 },
{ X86::PUNPCKHDQrr, X86::PUNPCKHDQrm, TB_ALIGN_16 },
{ X86::PUNPCKHQDQrr, X86::PUNPCKHQDQrm, TB_ALIGN_16 },
{ X86::PUNPCKHWDrr, X86::PUNPCKHWDrm, TB_ALIGN_16 },
{ X86::PUNPCKLBWrr, X86::PUNPCKLBWrm, TB_ALIGN_16 },
{ X86::PUNPCKLDQrr, X86::PUNPCKLDQrm, TB_ALIGN_16 },
{ X86::PUNPCKLQDQrr, X86::PUNPCKLQDQrm, TB_ALIGN_16 },
{ X86::PUNPCKLWDrr, X86::PUNPCKLWDrm, TB_ALIGN_16 },
{ X86::PXORrr, X86::PXORrm, TB_ALIGN_16 },
{ X86::ROUNDSDr_Int, X86::ROUNDSDm_Int, TB_NO_REVERSE },
{ X86::ROUNDSSr_Int, X86::ROUNDSSm_Int, TB_NO_REVERSE },
{ X86::SBB32rr, X86::SBB32rm, 0 },
{ X86::SBB64rr, X86::SBB64rm, 0 },
{ X86::SHUFPDrri, X86::SHUFPDrmi, TB_ALIGN_16 },
{ X86::SHUFPSrri, X86::SHUFPSrmi, TB_ALIGN_16 },
{ X86::SUB16rr, X86::SUB16rm, 0 },
{ X86::SUB32rr, X86::SUB32rm, 0 },
{ X86::SUB64rr, X86::SUB64rm, 0 },
{ X86::SUB8rr, X86::SUB8rm, 0 },
{ X86::SUBPDrr, X86::SUBPDrm, TB_ALIGN_16 },
{ X86::SUBPSrr, X86::SUBPSrm, TB_ALIGN_16 },
{ X86::SUBSDrr, X86::SUBSDrm, 0 },
{ X86::SUBSDrr_Int, X86::SUBSDrm_Int, TB_NO_REVERSE },
{ X86::SUBSSrr, X86::SUBSSrm, 0 },
{ X86::SUBSSrr_Int, X86::SUBSSrm_Int, TB_NO_REVERSE },
// FIXME: TEST*rr -> swapped operand of TEST*mr.
{ X86::UNPCKHPDrr, X86::UNPCKHPDrm, TB_ALIGN_16 },
{ X86::UNPCKHPSrr, X86::UNPCKHPSrm, TB_ALIGN_16 },
{ X86::UNPCKLPDrr, X86::UNPCKLPDrm, TB_ALIGN_16 },
{ X86::UNPCKLPSrr, X86::UNPCKLPSrm, TB_ALIGN_16 },
{ X86::XOR16rr, X86::XOR16rm, 0 },
{ X86::XOR32rr, X86::XOR32rm, 0 },
{ X86::XOR64rr, X86::XOR64rm, 0 },
{ X86::XOR8rr, X86::XOR8rm, 0 },
{ X86::XORPDrr, X86::XORPDrm, TB_ALIGN_16 },
{ X86::XORPSrr, X86::XORPSrm, TB_ALIGN_16 },
// MMX version of foldable instructions
{ X86::MMX_CVTPI2PSirr, X86::MMX_CVTPI2PSirm, 0 },
{ X86::MMX_PACKSSDWirr, X86::MMX_PACKSSDWirm, 0 },
{ X86::MMX_PACKSSWBirr, X86::MMX_PACKSSWBirm, 0 },
{ X86::MMX_PACKUSWBirr, X86::MMX_PACKUSWBirm, 0 },
{ X86::MMX_PADDBirr, X86::MMX_PADDBirm, 0 },
{ X86::MMX_PADDDirr, X86::MMX_PADDDirm, 0 },
{ X86::MMX_PADDQirr, X86::MMX_PADDQirm, 0 },
{ X86::MMX_PADDSBirr, X86::MMX_PADDSBirm, 0 },
{ X86::MMX_PADDSWirr, X86::MMX_PADDSWirm, 0 },
{ X86::MMX_PADDUSBirr, X86::MMX_PADDUSBirm, 0 },
{ X86::MMX_PADDUSWirr, X86::MMX_PADDUSWirm, 0 },
{ X86::MMX_PADDWirr, X86::MMX_PADDWirm, 0 },
{ X86::MMX_PALIGNR64irr, X86::MMX_PALIGNR64irm, 0 },
{ X86::MMX_PANDNirr, X86::MMX_PANDNirm, 0 },
{ X86::MMX_PANDirr, X86::MMX_PANDirm, 0 },
{ X86::MMX_PAVGBirr, X86::MMX_PAVGBirm, 0 },
{ X86::MMX_PAVGWirr, X86::MMX_PAVGWirm, 0 },
{ X86::MMX_PCMPEQBirr, X86::MMX_PCMPEQBirm, 0 },
{ X86::MMX_PCMPEQDirr, X86::MMX_PCMPEQDirm, 0 },
{ X86::MMX_PCMPEQWirr, X86::MMX_PCMPEQWirm, 0 },
{ X86::MMX_PCMPGTBirr, X86::MMX_PCMPGTBirm, 0 },
{ X86::MMX_PCMPGTDirr, X86::MMX_PCMPGTDirm, 0 },
{ X86::MMX_PCMPGTWirr, X86::MMX_PCMPGTWirm, 0 },
{ X86::MMX_PHADDSWrr64, X86::MMX_PHADDSWrm64, 0 },
{ X86::MMX_PHADDWrr64, X86::MMX_PHADDWrm64, 0 },
{ X86::MMX_PHADDrr64, X86::MMX_PHADDrm64, 0 },
{ X86::MMX_PHSUBDrr64, X86::MMX_PHSUBDrm64, 0 },
{ X86::MMX_PHSUBSWrr64, X86::MMX_PHSUBSWrm64, 0 },
{ X86::MMX_PHSUBWrr64, X86::MMX_PHSUBWrm64, 0 },
{ X86::MMX_PINSRWirri, X86::MMX_PINSRWirmi, 0 },
{ X86::MMX_PMADDUBSWrr64, X86::MMX_PMADDUBSWrm64, 0 },
{ X86::MMX_PMADDWDirr, X86::MMX_PMADDWDirm, 0 },
{ X86::MMX_PMAXSWirr, X86::MMX_PMAXSWirm, 0 },
{ X86::MMX_PMAXUBirr, X86::MMX_PMAXUBirm, 0 },
{ X86::MMX_PMINSWirr, X86::MMX_PMINSWirm, 0 },
{ X86::MMX_PMINUBirr, X86::MMX_PMINUBirm, 0 },
{ X86::MMX_PMULHRSWrr64, X86::MMX_PMULHRSWrm64, 0 },
{ X86::MMX_PMULHUWirr, X86::MMX_PMULHUWirm, 0 },
{ X86::MMX_PMULHWirr, X86::MMX_PMULHWirm, 0 },
{ X86::MMX_PMULLWirr, X86::MMX_PMULLWirm, 0 },
{ X86::MMX_PMULUDQirr, X86::MMX_PMULUDQirm, 0 },
{ X86::MMX_PORirr, X86::MMX_PORirm, 0 },
{ X86::MMX_PSADBWirr, X86::MMX_PSADBWirm, 0 },
{ X86::MMX_PSHUFBrr64, X86::MMX_PSHUFBrm64, 0 },
{ X86::MMX_PSIGNBrr64, X86::MMX_PSIGNBrm64, 0 },
{ X86::MMX_PSIGNDrr64, X86::MMX_PSIGNDrm64, 0 },
{ X86::MMX_PSIGNWrr64, X86::MMX_PSIGNWrm64, 0 },
{ X86::MMX_PSLLDrr, X86::MMX_PSLLDrm, 0 },
{ X86::MMX_PSLLQrr, X86::MMX_PSLLQrm, 0 },
{ X86::MMX_PSLLWrr, X86::MMX_PSLLWrm, 0 },
{ X86::MMX_PSRADrr, X86::MMX_PSRADrm, 0 },
{ X86::MMX_PSRAWrr, X86::MMX_PSRAWrm, 0 },
{ X86::MMX_PSRLDrr, X86::MMX_PSRLDrm, 0 },
{ X86::MMX_PSRLQrr, X86::MMX_PSRLQrm, 0 },
{ X86::MMX_PSRLWrr, X86::MMX_PSRLWrm, 0 },
{ X86::MMX_PSUBBirr, X86::MMX_PSUBBirm, 0 },
{ X86::MMX_PSUBDirr, X86::MMX_PSUBDirm, 0 },
{ X86::MMX_PSUBQirr, X86::MMX_PSUBQirm, 0 },
{ X86::MMX_PSUBSBirr, X86::MMX_PSUBSBirm, 0 },
{ X86::MMX_PSUBSWirr, X86::MMX_PSUBSWirm, 0 },
{ X86::MMX_PSUBUSBirr, X86::MMX_PSUBUSBirm, 0 },
{ X86::MMX_PSUBUSWirr, X86::MMX_PSUBUSWirm, 0 },
{ X86::MMX_PSUBWirr, X86::MMX_PSUBWirm, 0 },
{ X86::MMX_PUNPCKHBWirr, X86::MMX_PUNPCKHBWirm, 0 },
{ X86::MMX_PUNPCKHDQirr, X86::MMX_PUNPCKHDQirm, 0 },
{ X86::MMX_PUNPCKHWDirr, X86::MMX_PUNPCKHWDirm, 0 },
{ X86::MMX_PUNPCKLBWirr, X86::MMX_PUNPCKLBWirm, 0 },
{ X86::MMX_PUNPCKLDQirr, X86::MMX_PUNPCKLDQirm, 0 },
{ X86::MMX_PUNPCKLWDirr, X86::MMX_PUNPCKLWDirm, 0 },
{ X86::MMX_PXORirr, X86::MMX_PXORirm, 0 },
// 3DNow! version of foldable instructions
{ X86::PAVGUSBrr, X86::PAVGUSBrm, 0 },
{ X86::PFACCrr, X86::PFACCrm, 0 },
{ X86::PFADDrr, X86::PFADDrm, 0 },
{ X86::PFCMPEQrr, X86::PFCMPEQrm, 0 },
{ X86::PFCMPGErr, X86::PFCMPGErm, 0 },
{ X86::PFCMPGTrr, X86::PFCMPGTrm, 0 },
{ X86::PFMAXrr, X86::PFMAXrm, 0 },
{ X86::PFMINrr, X86::PFMINrm, 0 },
{ X86::PFMULrr, X86::PFMULrm, 0 },
{ X86::PFNACCrr, X86::PFNACCrm, 0 },
{ X86::PFPNACCrr, X86::PFPNACCrm, 0 },
{ X86::PFRCPIT1rr, X86::PFRCPIT1rm, 0 },
{ X86::PFRCPIT2rr, X86::PFRCPIT2rm, 0 },
{ X86::PFRSQIT1rr, X86::PFRSQIT1rm, 0 },
{ X86::PFSUBrr, X86::PFSUBrm, 0 },
{ X86::PFSUBRrr, X86::PFSUBRrm, 0 },
{ X86::PMULHRWrr, X86::PMULHRWrm, 0 },
// AVX 128-bit versions of foldable instructions
{ X86::VCVTSD2SSrr, X86::VCVTSD2SSrm, 0 },
{ X86::Int_VCVTSD2SSrr, X86::Int_VCVTSD2SSrm, TB_NO_REVERSE },
{ X86::VCVTSI2SD64rr, X86::VCVTSI2SD64rm, 0 },
{ X86::Int_VCVTSI2SD64rr, X86::Int_VCVTSI2SD64rm, 0 },
{ X86::VCVTSI2SDrr, X86::VCVTSI2SDrm, 0 },
{ X86::Int_VCVTSI2SDrr, X86::Int_VCVTSI2SDrm, 0 },
{ X86::VCVTSI2SS64rr, X86::VCVTSI2SS64rm, 0 },
{ X86::Int_VCVTSI2SS64rr, X86::Int_VCVTSI2SS64rm, 0 },
{ X86::VCVTSI2SSrr, X86::VCVTSI2SSrm, 0 },
{ X86::Int_VCVTSI2SSrr, X86::Int_VCVTSI2SSrm, 0 },
{ X86::VCVTSS2SDrr, X86::VCVTSS2SDrm, 0 },
{ X86::Int_VCVTSS2SDrr, X86::Int_VCVTSS2SDrm, TB_NO_REVERSE },
{ X86::VADDPDrr, X86::VADDPDrm, 0 },
{ X86::VADDPSrr, X86::VADDPSrm, 0 },
{ X86::VADDSDrr, X86::VADDSDrm, 0 },
{ X86::VADDSDrr_Int, X86::VADDSDrm_Int, TB_NO_REVERSE },
{ X86::VADDSSrr, X86::VADDSSrm, 0 },
{ X86::VADDSSrr_Int, X86::VADDSSrm_Int, TB_NO_REVERSE },
{ X86::VADDSUBPDrr, X86::VADDSUBPDrm, 0 },
{ X86::VADDSUBPSrr, X86::VADDSUBPSrm, 0 },
{ X86::VANDNPDrr, X86::VANDNPDrm, 0 },
{ X86::VANDNPSrr, X86::VANDNPSrm, 0 },
{ X86::VANDPDrr, X86::VANDPDrm, 0 },
{ X86::VANDPSrr, X86::VANDPSrm, 0 },
{ X86::VBLENDPDrri, X86::VBLENDPDrmi, 0 },
{ X86::VBLENDPSrri, X86::VBLENDPSrmi, 0 },
{ X86::VBLENDVPDrr, X86::VBLENDVPDrm, 0 },
{ X86::VBLENDVPSrr, X86::VBLENDVPSrm, 0 },
{ X86::VCMPPDrri, X86::VCMPPDrmi, 0 },
{ X86::VCMPPSrri, X86::VCMPPSrmi, 0 },
{ X86::VCMPSDrr, X86::VCMPSDrm, 0 },
{ X86::VCMPSSrr, X86::VCMPSSrm, 0 },
{ X86::VDIVPDrr, X86::VDIVPDrm, 0 },
{ X86::VDIVPSrr, X86::VDIVPSrm, 0 },
{ X86::VDIVSDrr, X86::VDIVSDrm, 0 },
{ X86::VDIVSDrr_Int, X86::VDIVSDrm_Int, TB_NO_REVERSE },
{ X86::VDIVSSrr, X86::VDIVSSrm, 0 },
{ X86::VDIVSSrr_Int, X86::VDIVSSrm_Int, TB_NO_REVERSE },
{ X86::VDPPDrri, X86::VDPPDrmi, 0 },
{ X86::VDPPSrri, X86::VDPPSrmi, 0 },
{ X86::VHADDPDrr, X86::VHADDPDrm, 0 },
{ X86::VHADDPSrr, X86::VHADDPSrm, 0 },
{ X86::VHSUBPDrr, X86::VHSUBPDrm, 0 },
{ X86::VHSUBPSrr, X86::VHSUBPSrm, 0 },
{ X86::Int_VCMPSDrr, X86::Int_VCMPSDrm, TB_NO_REVERSE },
{ X86::Int_VCMPSSrr, X86::Int_VCMPSSrm, TB_NO_REVERSE },
{ X86::VMAXCPDrr, X86::VMAXCPDrm, 0 },
{ X86::VMAXCPSrr, X86::VMAXCPSrm, 0 },
{ X86::VMAXCSDrr, X86::VMAXCSDrm, 0 },
{ X86::VMAXCSSrr, X86::VMAXCSSrm, 0 },
{ X86::VMAXPDrr, X86::VMAXPDrm, 0 },
{ X86::VMAXPSrr, X86::VMAXPSrm, 0 },
{ X86::VMAXSDrr, X86::VMAXSDrm, 0 },
{ X86::VMAXSDrr_Int, X86::VMAXSDrm_Int, TB_NO_REVERSE },
{ X86::VMAXSSrr, X86::VMAXSSrm, 0 },
{ X86::VMAXSSrr_Int, X86::VMAXSSrm_Int, TB_NO_REVERSE },
{ X86::VMINCPDrr, X86::VMINCPDrm, 0 },
{ X86::VMINCPSrr, X86::VMINCPSrm, 0 },
{ X86::VMINCSDrr, X86::VMINCSDrm, 0 },
{ X86::VMINCSSrr, X86::VMINCSSrm, 0 },
{ X86::VMINPDrr, X86::VMINPDrm, 0 },
{ X86::VMINPSrr, X86::VMINPSrm, 0 },
{ X86::VMINSDrr, X86::VMINSDrm, 0 },
{ X86::VMINSDrr_Int, X86::VMINSDrm_Int, TB_NO_REVERSE },
{ X86::VMINSSrr, X86::VMINSSrm, 0 },
{ X86::VMINSSrr_Int, X86::VMINSSrm_Int, TB_NO_REVERSE },
{ X86::VMOVLHPSrr, X86::VMOVHPSrm, TB_NO_REVERSE },
{ X86::VMPSADBWrri, X86::VMPSADBWrmi, 0 },
{ X86::VMULPDrr, X86::VMULPDrm, 0 },
{ X86::VMULPSrr, X86::VMULPSrm, 0 },
{ X86::VMULSDrr, X86::VMULSDrm, 0 },
{ X86::VMULSDrr_Int, X86::VMULSDrm_Int, TB_NO_REVERSE },
{ X86::VMULSSrr, X86::VMULSSrm, 0 },
{ X86::VMULSSrr_Int, X86::VMULSSrm_Int, TB_NO_REVERSE },
{ X86::VORPDrr, X86::VORPDrm, 0 },
{ X86::VORPSrr, X86::VORPSrm, 0 },
{ X86::VPACKSSDWrr, X86::VPACKSSDWrm, 0 },
{ X86::VPACKSSWBrr, X86::VPACKSSWBrm, 0 },
{ X86::VPACKUSDWrr, X86::VPACKUSDWrm, 0 },
{ X86::VPACKUSWBrr, X86::VPACKUSWBrm, 0 },
{ X86::VPADDBrr, X86::VPADDBrm, 0 },
{ X86::VPADDDrr, X86::VPADDDrm, 0 },
{ X86::VPADDQrr, X86::VPADDQrm, 0 },
{ X86::VPADDSBrr, X86::VPADDSBrm, 0 },
{ X86::VPADDSWrr, X86::VPADDSWrm, 0 },
{ X86::VPADDUSBrr, X86::VPADDUSBrm, 0 },
{ X86::VPADDUSWrr, X86::VPADDUSWrm, 0 },
{ X86::VPADDWrr, X86::VPADDWrm, 0 },
{ X86::VPALIGNRrri, X86::VPALIGNRrmi, 0 },
{ X86::VPANDNrr, X86::VPANDNrm, 0 },
{ X86::VPANDrr, X86::VPANDrm, 0 },
{ X86::VPAVGBrr, X86::VPAVGBrm, 0 },
{ X86::VPAVGWrr, X86::VPAVGWrm, 0 },
{ X86::VPBLENDVBrr, X86::VPBLENDVBrm, 0 },
{ X86::VPBLENDWrri, X86::VPBLENDWrmi, 0 },
{ X86::VPCLMULQDQrr, X86::VPCLMULQDQrm, 0 },
{ X86::VPCMPEQBrr, X86::VPCMPEQBrm, 0 },
{ X86::VPCMPEQDrr, X86::VPCMPEQDrm, 0 },
{ X86::VPCMPEQQrr, X86::VPCMPEQQrm, 0 },
{ X86::VPCMPEQWrr, X86::VPCMPEQWrm, 0 },
{ X86::VPCMPGTBrr, X86::VPCMPGTBrm, 0 },
{ X86::VPCMPGTDrr, X86::VPCMPGTDrm, 0 },
{ X86::VPCMPGTQrr, X86::VPCMPGTQrm, 0 },
{ X86::VPCMPGTWrr, X86::VPCMPGTWrm, 0 },
{ X86::VPHADDDrr, X86::VPHADDDrm, 0 },
{ X86::VPHADDSWrr128, X86::VPHADDSWrm128, 0 },
{ X86::VPHADDWrr, X86::VPHADDWrm, 0 },
{ X86::VPHSUBDrr, X86::VPHSUBDrm, 0 },
{ X86::VPHSUBSWrr128, X86::VPHSUBSWrm128, 0 },
{ X86::VPHSUBWrr, X86::VPHSUBWrm, 0 },
{ X86::VPERMILPDrr, X86::VPERMILPDrm, 0 },
{ X86::VPERMILPSrr, X86::VPERMILPSrm, 0 },
{ X86::VPINSRBrr, X86::VPINSRBrm, 0 },
{ X86::VPINSRDrr, X86::VPINSRDrm, 0 },
{ X86::VPINSRQrr, X86::VPINSRQrm, 0 },
{ X86::VPINSRWrri, X86::VPINSRWrmi, 0 },
{ X86::VPMADDUBSWrr, X86::VPMADDUBSWrm, 0 },
{ X86::VPMADDWDrr, X86::VPMADDWDrm, 0 },
{ X86::VPMAXSWrr, X86::VPMAXSWrm, 0 },
{ X86::VPMAXUBrr, X86::VPMAXUBrm, 0 },
{ X86::VPMINSWrr, X86::VPMINSWrm, 0 },
{ X86::VPMINUBrr, X86::VPMINUBrm, 0 },
{ X86::VPMINSBrr, X86::VPMINSBrm, 0 },
{ X86::VPMINSDrr, X86::VPMINSDrm, 0 },
{ X86::VPMINUDrr, X86::VPMINUDrm, 0 },
{ X86::VPMINUWrr, X86::VPMINUWrm, 0 },
{ X86::VPMAXSBrr, X86::VPMAXSBrm, 0 },
{ X86::VPMAXSDrr, X86::VPMAXSDrm, 0 },
{ X86::VPMAXUDrr, X86::VPMAXUDrm, 0 },
{ X86::VPMAXUWrr, X86::VPMAXUWrm, 0 },
{ X86::VPMULDQrr, X86::VPMULDQrm, 0 },
{ X86::VPMULHRSWrr, X86::VPMULHRSWrm, 0 },
{ X86::VPMULHUWrr, X86::VPMULHUWrm, 0 },
{ X86::VPMULHWrr, X86::VPMULHWrm, 0 },
{ X86::VPMULLDrr, X86::VPMULLDrm, 0 },
{ X86::VPMULLWrr, X86::VPMULLWrm, 0 },
{ X86::VPMULUDQrr, X86::VPMULUDQrm, 0 },
{ X86::VPORrr, X86::VPORrm, 0 },
{ X86::VPSADBWrr, X86::VPSADBWrm, 0 },
{ X86::VPSHUFBrr, X86::VPSHUFBrm, 0 },
{ X86::VPSIGNBrr128, X86::VPSIGNBrm128, 0 },
{ X86::VPSIGNWrr128, X86::VPSIGNWrm128, 0 },
{ X86::VPSIGNDrr128, X86::VPSIGNDrm128, 0 },
{ X86::VPSLLDrr, X86::VPSLLDrm, 0 },
{ X86::VPSLLQrr, X86::VPSLLQrm, 0 },
{ X86::VPSLLWrr, X86::VPSLLWrm, 0 },
{ X86::VPSRADrr, X86::VPSRADrm, 0 },
{ X86::VPSRAWrr, X86::VPSRAWrm, 0 },
{ X86::VPSRLDrr, X86::VPSRLDrm, 0 },
{ X86::VPSRLQrr, X86::VPSRLQrm, 0 },
{ X86::VPSRLWrr, X86::VPSRLWrm, 0 },
{ X86::VPSUBBrr, X86::VPSUBBrm, 0 },
{ X86::VPSUBDrr, X86::VPSUBDrm, 0 },
{ X86::VPSUBQrr, X86::VPSUBQrm, 0 },
{ X86::VPSUBSBrr, X86::VPSUBSBrm, 0 },
{ X86::VPSUBSWrr, X86::VPSUBSWrm, 0 },
{ X86::VPSUBUSBrr, X86::VPSUBUSBrm, 0 },
{ X86::VPSUBUSWrr, X86::VPSUBUSWrm, 0 },
{ X86::VPSUBWrr, X86::VPSUBWrm, 0 },
{ X86::VPUNPCKHBWrr, X86::VPUNPCKHBWrm, 0 },
{ X86::VPUNPCKHDQrr, X86::VPUNPCKHDQrm, 0 },
{ X86::VPUNPCKHQDQrr, X86::VPUNPCKHQDQrm, 0 },
{ X86::VPUNPCKHWDrr, X86::VPUNPCKHWDrm, 0 },
{ X86::VPUNPCKLBWrr, X86::VPUNPCKLBWrm, 0 },
{ X86::VPUNPCKLDQrr, X86::VPUNPCKLDQrm, 0 },
{ X86::VPUNPCKLQDQrr, X86::VPUNPCKLQDQrm, 0 },
{ X86::VPUNPCKLWDrr, X86::VPUNPCKLWDrm, 0 },
{ X86::VPXORrr, X86::VPXORrm, 0 },
{ X86::VRCPSSr, X86::VRCPSSm, 0 },
{ X86::VRCPSSr_Int, X86::VRCPSSm_Int, TB_NO_REVERSE },
{ X86::VRSQRTSSr, X86::VRSQRTSSm, 0 },
{ X86::VRSQRTSSr_Int, X86::VRSQRTSSm_Int, TB_NO_REVERSE },
{ X86::VROUNDSDr, X86::VROUNDSDm, 0 },
{ X86::VROUNDSDr_Int, X86::VROUNDSDm_Int, TB_NO_REVERSE },
{ X86::VROUNDSSr, X86::VROUNDSSm, 0 },
{ X86::VROUNDSSr_Int, X86::VROUNDSSm_Int, TB_NO_REVERSE },
{ X86::VSHUFPDrri, X86::VSHUFPDrmi, 0 },
{ X86::VSHUFPSrri, X86::VSHUFPSrmi, 0 },
{ X86::VSQRTSDr, X86::VSQRTSDm, 0 },
{ X86::VSQRTSDr_Int, X86::VSQRTSDm_Int, TB_NO_REVERSE },
{ X86::VSQRTSSr, X86::VSQRTSSm, 0 },
{ X86::VSQRTSSr_Int, X86::VSQRTSSm_Int, TB_NO_REVERSE },
{ X86::VSUBPDrr, X86::VSUBPDrm, 0 },
{ X86::VSUBPSrr, X86::VSUBPSrm, 0 },
{ X86::VSUBSDrr, X86::VSUBSDrm, 0 },
{ X86::VSUBSDrr_Int, X86::VSUBSDrm_Int, TB_NO_REVERSE },
{ X86::VSUBSSrr, X86::VSUBSSrm, 0 },
{ X86::VSUBSSrr_Int, X86::VSUBSSrm_Int, TB_NO_REVERSE },
{ X86::VUNPCKHPDrr, X86::VUNPCKHPDrm, 0 },
{ X86::VUNPCKHPSrr, X86::VUNPCKHPSrm, 0 },
{ X86::VUNPCKLPDrr, X86::VUNPCKLPDrm, 0 },
{ X86::VUNPCKLPSrr, X86::VUNPCKLPSrm, 0 },
{ X86::VXORPDrr, X86::VXORPDrm, 0 },
{ X86::VXORPSrr, X86::VXORPSrm, 0 },
// AVX 256-bit foldable instructions
{ X86::VADDPDYrr, X86::VADDPDYrm, 0 },
{ X86::VADDPSYrr, X86::VADDPSYrm, 0 },
{ X86::VADDSUBPDYrr, X86::VADDSUBPDYrm, 0 },
{ X86::VADDSUBPSYrr, X86::VADDSUBPSYrm, 0 },
{ X86::VANDNPDYrr, X86::VANDNPDYrm, 0 },
{ X86::VANDNPSYrr, X86::VANDNPSYrm, 0 },
{ X86::VANDPDYrr, X86::VANDPDYrm, 0 },
{ X86::VANDPSYrr, X86::VANDPSYrm, 0 },
{ X86::VBLENDPDYrri, X86::VBLENDPDYrmi, 0 },
{ X86::VBLENDPSYrri, X86::VBLENDPSYrmi, 0 },
{ X86::VBLENDVPDYrr, X86::VBLENDVPDYrm, 0 },
{ X86::VBLENDVPSYrr, X86::VBLENDVPSYrm, 0 },
{ X86::VCMPPDYrri, X86::VCMPPDYrmi, 0 },
{ X86::VCMPPSYrri, X86::VCMPPSYrmi, 0 },
{ X86::VDIVPDYrr, X86::VDIVPDYrm, 0 },
{ X86::VDIVPSYrr, X86::VDIVPSYrm, 0 },
{ X86::VDPPSYrri, X86::VDPPSYrmi, 0 },
{ X86::VHADDPDYrr, X86::VHADDPDYrm, 0 },
{ X86::VHADDPSYrr, X86::VHADDPSYrm, 0 },
{ X86::VHSUBPDYrr, X86::VHSUBPDYrm, 0 },
{ X86::VHSUBPSYrr, X86::VHSUBPSYrm, 0 },
{ X86::VINSERTF128rr, X86::VINSERTF128rm, 0 },
{ X86::VMAXCPDYrr, X86::VMAXCPDYrm, 0 },
{ X86::VMAXCPSYrr, X86::VMAXCPSYrm, 0 },
{ X86::VMAXPDYrr, X86::VMAXPDYrm, 0 },
{ X86::VMAXPSYrr, X86::VMAXPSYrm, 0 },
{ X86::VMINCPDYrr, X86::VMINCPDYrm, 0 },
{ X86::VMINCPSYrr, X86::VMINCPSYrm, 0 },
{ X86::VMINPDYrr, X86::VMINPDYrm, 0 },
{ X86::VMINPSYrr, X86::VMINPSYrm, 0 },
{ X86::VMULPDYrr, X86::VMULPDYrm, 0 },
{ X86::VMULPSYrr, X86::VMULPSYrm, 0 },
{ X86::VORPDYrr, X86::VORPDYrm, 0 },
{ X86::VORPSYrr, X86::VORPSYrm, 0 },
{ X86::VPERM2F128rr, X86::VPERM2F128rm, 0 },
{ X86::VPERMILPDYrr, X86::VPERMILPDYrm, 0 },
{ X86::VPERMILPSYrr, X86::VPERMILPSYrm, 0 },
{ X86::VSHUFPDYrri, X86::VSHUFPDYrmi, 0 },
{ X86::VSHUFPSYrri, X86::VSHUFPSYrmi, 0 },
{ X86::VSUBPDYrr, X86::VSUBPDYrm, 0 },
{ X86::VSUBPSYrr, X86::VSUBPSYrm, 0 },
{ X86::VUNPCKHPDYrr, X86::VUNPCKHPDYrm, 0 },
{ X86::VUNPCKHPSYrr, X86::VUNPCKHPSYrm, 0 },
{ X86::VUNPCKLPDYrr, X86::VUNPCKLPDYrm, 0 },
{ X86::VUNPCKLPSYrr, X86::VUNPCKLPSYrm, 0 },
{ X86::VXORPDYrr, X86::VXORPDYrm, 0 },
{ X86::VXORPSYrr, X86::VXORPSYrm, 0 },
// AVX2 foldable instructions
{ X86::VINSERTI128rr, X86::VINSERTI128rm, 0 },
{ X86::VPACKSSDWYrr, X86::VPACKSSDWYrm, 0 },
{ X86::VPACKSSWBYrr, X86::VPACKSSWBYrm, 0 },
{ X86::VPACKUSDWYrr, X86::VPACKUSDWYrm, 0 },
{ X86::VPACKUSWBYrr, X86::VPACKUSWBYrm, 0 },
{ X86::VPADDBYrr, X86::VPADDBYrm, 0 },
{ X86::VPADDDYrr, X86::VPADDDYrm, 0 },
{ X86::VPADDQYrr, X86::VPADDQYrm, 0 },
{ X86::VPADDSBYrr, X86::VPADDSBYrm, 0 },
{ X86::VPADDSWYrr, X86::VPADDSWYrm, 0 },
{ X86::VPADDUSBYrr, X86::VPADDUSBYrm, 0 },
{ X86::VPADDUSWYrr, X86::VPADDUSWYrm, 0 },
{ X86::VPADDWYrr, X86::VPADDWYrm, 0 },
{ X86::VPALIGNRYrri, X86::VPALIGNRYrmi, 0 },
{ X86::VPANDNYrr, X86::VPANDNYrm, 0 },
{ X86::VPANDYrr, X86::VPANDYrm, 0 },
{ X86::VPAVGBYrr, X86::VPAVGBYrm, 0 },
{ X86::VPAVGWYrr, X86::VPAVGWYrm, 0 },
{ X86::VPBLENDDrri, X86::VPBLENDDrmi, 0 },
{ X86::VPBLENDDYrri, X86::VPBLENDDYrmi, 0 },
{ X86::VPBLENDVBYrr, X86::VPBLENDVBYrm, 0 },
{ X86::VPBLENDWYrri, X86::VPBLENDWYrmi, 0 },
{ X86::VPCMPEQBYrr, X86::VPCMPEQBYrm, 0 },
{ X86::VPCMPEQDYrr, X86::VPCMPEQDYrm, 0 },
{ X86::VPCMPEQQYrr, X86::VPCMPEQQYrm, 0 },
{ X86::VPCMPEQWYrr, X86::VPCMPEQWYrm, 0 },
{ X86::VPCMPGTBYrr, X86::VPCMPGTBYrm, 0 },
{ X86::VPCMPGTDYrr, X86::VPCMPGTDYrm, 0 },
{ X86::VPCMPGTQYrr, X86::VPCMPGTQYrm, 0 },
{ X86::VPCMPGTWYrr, X86::VPCMPGTWYrm, 0 },
{ X86::VPERM2I128rr, X86::VPERM2I128rm, 0 },
{ X86::VPERMDYrr, X86::VPERMDYrm, 0 },
{ X86::VPERMPSYrr, X86::VPERMPSYrm, 0 },
{ X86::VPHADDDYrr, X86::VPHADDDYrm, 0 },
{ X86::VPHADDSWrr256, X86::VPHADDSWrm256, 0 },
{ X86::VPHADDWYrr, X86::VPHADDWYrm, 0 },
{ X86::VPHSUBDYrr, X86::VPHSUBDYrm, 0 },
{ X86::VPHSUBSWrr256, X86::VPHSUBSWrm256, 0 },
{ X86::VPHSUBWYrr, X86::VPHSUBWYrm, 0 },
{ X86::VPMADDUBSWYrr, X86::VPMADDUBSWYrm, 0 },
{ X86::VPMADDWDYrr, X86::VPMADDWDYrm, 0 },
{ X86::VPMAXSWYrr, X86::VPMAXSWYrm, 0 },
{ X86::VPMAXUBYrr, X86::VPMAXUBYrm, 0 },
{ X86::VPMINSWYrr, X86::VPMINSWYrm, 0 },
{ X86::VPMINUBYrr, X86::VPMINUBYrm, 0 },
{ X86::VPMINSBYrr, X86::VPMINSBYrm, 0 },
{ X86::VPMINSDYrr, X86::VPMINSDYrm, 0 },
{ X86::VPMINUDYrr, X86::VPMINUDYrm, 0 },
{ X86::VPMINUWYrr, X86::VPMINUWYrm, 0 },
{ X86::VPMAXSBYrr, X86::VPMAXSBYrm, 0 },
{ X86::VPMAXSDYrr, X86::VPMAXSDYrm, 0 },
{ X86::VPMAXUDYrr, X86::VPMAXUDYrm, 0 },
{ X86::VPMAXUWYrr, X86::VPMAXUWYrm, 0 },
{ X86::VMPSADBWYrri, X86::VMPSADBWYrmi, 0 },
{ X86::VPMULDQYrr, X86::VPMULDQYrm, 0 },
{ X86::VPMULHRSWYrr, X86::VPMULHRSWYrm, 0 },
{ X86::VPMULHUWYrr, X86::VPMULHUWYrm, 0 },
{ X86::VPMULHWYrr, X86::VPMULHWYrm, 0 },
{ X86::VPMULLDYrr, X86::VPMULLDYrm, 0 },
{ X86::VPMULLWYrr, X86::VPMULLWYrm, 0 },
{ X86::VPMULUDQYrr, X86::VPMULUDQYrm, 0 },
{ X86::VPORYrr, X86::VPORYrm, 0 },
{ X86::VPSADBWYrr, X86::VPSADBWYrm, 0 },
{ X86::VPSHUFBYrr, X86::VPSHUFBYrm, 0 },
{ X86::VPSIGNBYrr256, X86::VPSIGNBYrm256, 0 },
{ X86::VPSIGNWYrr256, X86::VPSIGNWYrm256, 0 },
{ X86::VPSIGNDYrr256, X86::VPSIGNDYrm256, 0 },
{ X86::VPSLLDYrr, X86::VPSLLDYrm, 0 },
{ X86::VPSLLQYrr, X86::VPSLLQYrm, 0 },
{ X86::VPSLLWYrr, X86::VPSLLWYrm, 0 },
{ X86::VPSLLVDrr, X86::VPSLLVDrm, 0 },
{ X86::VPSLLVDYrr, X86::VPSLLVDYrm, 0 },
{ X86::VPSLLVQrr, X86::VPSLLVQrm, 0 },
{ X86::VPSLLVQYrr, X86::VPSLLVQYrm, 0 },
{ X86::VPSRADYrr, X86::VPSRADYrm, 0 },
{ X86::VPSRAWYrr, X86::VPSRAWYrm, 0 },
{ X86::VPSRAVDrr, X86::VPSRAVDrm, 0 },
{ X86::VPSRAVDYrr, X86::VPSRAVDYrm, 0 },
{ X86::VPSRLDYrr, X86::VPSRLDYrm, 0 },
{ X86::VPSRLQYrr, X86::VPSRLQYrm, 0 },
{ X86::VPSRLWYrr, X86::VPSRLWYrm, 0 },
{ X86::VPSRLVDrr, X86::VPSRLVDrm, 0 },
{ X86::VPSRLVDYrr, X86::VPSRLVDYrm, 0 },
{ X86::VPSRLVQrr, X86::VPSRLVQrm, 0 },
{ X86::VPSRLVQYrr, X86::VPSRLVQYrm, 0 },
{ X86::VPSUBBYrr, X86::VPSUBBYrm, 0 },
{ X86::VPSUBDYrr, X86::VPSUBDYrm, 0 },
{ X86::VPSUBQYrr, X86::VPSUBQYrm, 0 },
{ X86::VPSUBSBYrr, X86::VPSUBSBYrm, 0 },
{ X86::VPSUBSWYrr, X86::VPSUBSWYrm, 0 },
{ X86::VPSUBUSBYrr, X86::VPSUBUSBYrm, 0 },
{ X86::VPSUBUSWYrr, X86::VPSUBUSWYrm, 0 },
{ X86::VPSUBWYrr, X86::VPSUBWYrm, 0 },
{ X86::VPUNPCKHBWYrr, X86::VPUNPCKHBWYrm, 0 },
{ X86::VPUNPCKHDQYrr, X86::VPUNPCKHDQYrm, 0 },
{ X86::VPUNPCKHQDQYrr, X86::VPUNPCKHQDQYrm, 0 },
{ X86::VPUNPCKHWDYrr, X86::VPUNPCKHWDYrm, 0 },
{ X86::VPUNPCKLBWYrr, X86::VPUNPCKLBWYrm, 0 },
{ X86::VPUNPCKLDQYrr, X86::VPUNPCKLDQYrm, 0 },
{ X86::VPUNPCKLQDQYrr, X86::VPUNPCKLQDQYrm, 0 },
{ X86::VPUNPCKLWDYrr, X86::VPUNPCKLWDYrm, 0 },
{ X86::VPXORYrr, X86::VPXORYrm, 0 },
// FMA4 foldable patterns
{ X86::VFMADDSS4rr, X86::VFMADDSS4mr, TB_ALIGN_NONE },
{ X86::VFMADDSS4rr_Int, X86::VFMADDSS4mr_Int, TB_NO_REVERSE },
{ X86::VFMADDSD4rr, X86::VFMADDSD4mr, TB_ALIGN_NONE },
{ X86::VFMADDSD4rr_Int, X86::VFMADDSD4mr_Int, TB_NO_REVERSE },
{ X86::VFMADDPS4rr, X86::VFMADDPS4mr, TB_ALIGN_NONE },
{ X86::VFMADDPD4rr, X86::VFMADDPD4mr, TB_ALIGN_NONE },
{ X86::VFMADDPS4Yrr, X86::VFMADDPS4Ymr, TB_ALIGN_NONE },
{ X86::VFMADDPD4Yrr, X86::VFMADDPD4Ymr, TB_ALIGN_NONE },
{ X86::VFNMADDSS4rr, X86::VFNMADDSS4mr, TB_ALIGN_NONE },
{ X86::VFNMADDSS4rr_Int, X86::VFNMADDSS4mr_Int, TB_NO_REVERSE },
{ X86::VFNMADDSD4rr, X86::VFNMADDSD4mr, TB_ALIGN_NONE },
{ X86::VFNMADDSD4rr_Int, X86::VFNMADDSD4mr_Int, TB_NO_REVERSE },
{ X86::VFNMADDPS4rr, X86::VFNMADDPS4mr, TB_ALIGN_NONE },
{ X86::VFNMADDPD4rr, X86::VFNMADDPD4mr, TB_ALIGN_NONE },
{ X86::VFNMADDPS4Yrr, X86::VFNMADDPS4Ymr, TB_ALIGN_NONE },
{ X86::VFNMADDPD4Yrr, X86::VFNMADDPD4Ymr, TB_ALIGN_NONE },
{ X86::VFMSUBSS4rr, X86::VFMSUBSS4mr, TB_ALIGN_NONE },
{ X86::VFMSUBSS4rr_Int, X86::VFMSUBSS4mr_Int, TB_NO_REVERSE },
{ X86::VFMSUBSD4rr, X86::VFMSUBSD4mr, TB_ALIGN_NONE },
{ X86::VFMSUBSD4rr_Int, X86::VFMSUBSD4mr_Int, TB_NO_REVERSE },
{ X86::VFMSUBPS4rr, X86::VFMSUBPS4mr, TB_ALIGN_NONE },
{ X86::VFMSUBPD4rr, X86::VFMSUBPD4mr, TB_ALIGN_NONE },
{ X86::VFMSUBPS4Yrr, X86::VFMSUBPS4Ymr, TB_ALIGN_NONE },
{ X86::VFMSUBPD4Yrr, X86::VFMSUBPD4Ymr, TB_ALIGN_NONE },
{ X86::VFNMSUBSS4rr, X86::VFNMSUBSS4mr, TB_ALIGN_NONE },
{ X86::VFNMSUBSS4rr_Int, X86::VFNMSUBSS4mr_Int, TB_NO_REVERSE },
{ X86::VFNMSUBSD4rr, X86::VFNMSUBSD4mr, TB_ALIGN_NONE },
{ X86::VFNMSUBSD4rr_Int, X86::VFNMSUBSD4mr_Int, TB_NO_REVERSE },
{ X86::VFNMSUBPS4rr, X86::VFNMSUBPS4mr, TB_ALIGN_NONE },
{ X86::VFNMSUBPD4rr, X86::VFNMSUBPD4mr, TB_ALIGN_NONE },
{ X86::VFNMSUBPS4Yrr, X86::VFNMSUBPS4Ymr, TB_ALIGN_NONE },
{ X86::VFNMSUBPD4Yrr, X86::VFNMSUBPD4Ymr, TB_ALIGN_NONE },
{ X86::VFMADDSUBPS4rr, X86::VFMADDSUBPS4mr, TB_ALIGN_NONE },
{ X86::VFMADDSUBPD4rr, X86::VFMADDSUBPD4mr, TB_ALIGN_NONE },
{ X86::VFMADDSUBPS4Yrr, X86::VFMADDSUBPS4Ymr, TB_ALIGN_NONE },
{ X86::VFMADDSUBPD4Yrr, X86::VFMADDSUBPD4Ymr, TB_ALIGN_NONE },
{ X86::VFMSUBADDPS4rr, X86::VFMSUBADDPS4mr, TB_ALIGN_NONE },
{ X86::VFMSUBADDPD4rr, X86::VFMSUBADDPD4mr, TB_ALIGN_NONE },
{ X86::VFMSUBADDPS4Yrr, X86::VFMSUBADDPS4Ymr, TB_ALIGN_NONE },
{ X86::VFMSUBADDPD4Yrr, X86::VFMSUBADDPD4Ymr, TB_ALIGN_NONE },
// XOP foldable instructions
{ X86::VPCMOVrrr, X86::VPCMOVrmr, 0 },
{ X86::VPCMOVrrrY, X86::VPCMOVrmrY, 0 },
{ X86::VPCOMBri, X86::VPCOMBmi, 0 },
{ X86::VPCOMDri, X86::VPCOMDmi, 0 },
{ X86::VPCOMQri, X86::VPCOMQmi, 0 },
{ X86::VPCOMWri, X86::VPCOMWmi, 0 },
{ X86::VPCOMUBri, X86::VPCOMUBmi, 0 },
{ X86::VPCOMUDri, X86::VPCOMUDmi, 0 },
{ X86::VPCOMUQri, X86::VPCOMUQmi, 0 },
{ X86::VPCOMUWri, X86::VPCOMUWmi, 0 },
{ X86::VPERMIL2PDrr, X86::VPERMIL2PDmr, 0 },
{ X86::VPERMIL2PDrrY, X86::VPERMIL2PDmrY, 0 },
{ X86::VPERMIL2PSrr, X86::VPERMIL2PSmr, 0 },
{ X86::VPERMIL2PSrrY, X86::VPERMIL2PSmrY, 0 },
{ X86::VPMACSDDrr, X86::VPMACSDDrm, 0 },
{ X86::VPMACSDQHrr, X86::VPMACSDQHrm, 0 },
{ X86::VPMACSDQLrr, X86::VPMACSDQLrm, 0 },
{ X86::VPMACSSDDrr, X86::VPMACSSDDrm, 0 },
{ X86::VPMACSSDQHrr, X86::VPMACSSDQHrm, 0 },
{ X86::VPMACSSDQLrr, X86::VPMACSSDQLrm, 0 },
{ X86::VPMACSSWDrr, X86::VPMACSSWDrm, 0 },
{ X86::VPMACSSWWrr, X86::VPMACSSWWrm, 0 },
{ X86::VPMACSWDrr, X86::VPMACSWDrm, 0 },
{ X86::VPMACSWWrr, X86::VPMACSWWrm, 0 },
{ X86::VPMADCSSWDrr, X86::VPMADCSSWDrm, 0 },
{ X86::VPMADCSWDrr, X86::VPMADCSWDrm, 0 },
{ X86::VPPERMrrr, X86::VPPERMrmr, 0 },
{ X86::VPROTBrr, X86::VPROTBrm, 0 },
{ X86::VPROTDrr, X86::VPROTDrm, 0 },
{ X86::VPROTQrr, X86::VPROTQrm, 0 },
{ X86::VPROTWrr, X86::VPROTWrm, 0 },
{ X86::VPSHABrr, X86::VPSHABrm, 0 },
{ X86::VPSHADrr, X86::VPSHADrm, 0 },
{ X86::VPSHAQrr, X86::VPSHAQrm, 0 },
{ X86::VPSHAWrr, X86::VPSHAWrm, 0 },
{ X86::VPSHLBrr, X86::VPSHLBrm, 0 },
{ X86::VPSHLDrr, X86::VPSHLDrm, 0 },
{ X86::VPSHLQrr, X86::VPSHLQrm, 0 },
{ X86::VPSHLWrr, X86::VPSHLWrm, 0 },
// BMI/BMI2 foldable instructions
{ X86::ANDN32rr, X86::ANDN32rm, 0 },
{ X86::ANDN64rr, X86::ANDN64rm, 0 },
{ X86::MULX32rr, X86::MULX32rm, 0 },
{ X86::MULX64rr, X86::MULX64rm, 0 },
{ X86::PDEP32rr, X86::PDEP32rm, 0 },
{ X86::PDEP64rr, X86::PDEP64rm, 0 },
{ X86::PEXT32rr, X86::PEXT32rm, 0 },
{ X86::PEXT64rr, X86::PEXT64rm, 0 },
// ADX foldable instructions
{ X86::ADCX32rr, X86::ADCX32rm, 0 },
{ X86::ADCX64rr, X86::ADCX64rm, 0 },
{ X86::ADOX32rr, X86::ADOX32rm, 0 },
{ X86::ADOX64rr, X86::ADOX64rm, 0 },
// AVX-512 foldable instructions
{ X86::VADDPDZrr, X86::VADDPDZrm, 0 },
{ X86::VADDPSZrr, X86::VADDPSZrm, 0 },
{ X86::VADDSDZrr, X86::VADDSDZrm, 0 },
{ X86::VADDSDZrr_Int, X86::VADDSDZrm_Int, TB_NO_REVERSE },
{ X86::VADDSSZrr, X86::VADDSSZrm, 0 },
{ X86::VADDSSZrr_Int, X86::VADDSSZrm_Int, TB_NO_REVERSE },
{ X86::VALIGNDZrri, X86::VALIGNDZrmi, 0 },
{ X86::VALIGNQZrri, X86::VALIGNQZrmi, 0 },
{ X86::VANDNPDZrr, X86::VANDNPDZrm, 0 },
{ X86::VANDNPSZrr, X86::VANDNPSZrm, 0 },
{ X86::VANDPDZrr, X86::VANDPDZrm, 0 },
{ X86::VANDPSZrr, X86::VANDPSZrm, 0 },
{ X86::VBROADCASTSSZrkz, X86::VBROADCASTSSZmkz, TB_NO_REVERSE },
{ X86::VBROADCASTSDZrkz, X86::VBROADCASTSDZmkz, TB_NO_REVERSE },
{ X86::VCMPPDZrri, X86::VCMPPDZrmi, 0 },
{ X86::VCMPPSZrri, X86::VCMPPSZrmi, 0 },
{ X86::VCMPSDZrr, X86::VCMPSDZrm, 0 },
{ X86::VCMPSDZrr_Int, X86::VCMPSDZrm_Int, TB_NO_REVERSE },
{ X86::VCMPSSZrr, X86::VCMPSSZrm, 0 },
{ X86::VCMPSSZrr_Int, X86::VCMPSSZrm_Int, TB_NO_REVERSE },
{ X86::VDIVPDZrr, X86::VDIVPDZrm, 0 },
{ X86::VDIVPSZrr, X86::VDIVPSZrm, 0 },
{ X86::VDIVSDZrr, X86::VDIVSDZrm, 0 },
{ X86::VDIVSDZrr_Int, X86::VDIVSDZrm_Int, TB_NO_REVERSE },
{ X86::VDIVSSZrr, X86::VDIVSSZrm, 0 },
{ X86::VDIVSSZrr_Int, X86::VDIVSSZrm_Int, TB_NO_REVERSE },
{ X86::VINSERTF32x4Zrr, X86::VINSERTF32x4Zrm, 0 },
{ X86::VINSERTF32x8Zrr, X86::VINSERTF32x8Zrm, 0 },
{ X86::VINSERTF64x2Zrr, X86::VINSERTF64x2Zrm, 0 },
{ X86::VINSERTF64x4Zrr, X86::VINSERTF64x4Zrm, 0 },
{ X86::VINSERTI32x4Zrr, X86::VINSERTI32x4Zrm, 0 },
{ X86::VINSERTI32x8Zrr, X86::VINSERTI32x8Zrm, 0 },
{ X86::VINSERTI64x2Zrr, X86::VINSERTI64x2Zrm, 0 },
{ X86::VINSERTI64x4Zrr, X86::VINSERTI64x4Zrm, 0 },
{ X86::VMAXCPDZrr, X86::VMAXCPDZrm, 0 },
{ X86::VMAXCPSZrr, X86::VMAXCPSZrm, 0 },
{ X86::VMAXCSDZrr, X86::VMAXCSDZrm, 0 },
{ X86::VMAXCSSZrr, X86::VMAXCSSZrm, 0 },
{ X86::VMAXPDZrr, X86::VMAXPDZrm, 0 },
{ X86::VMAXPSZrr, X86::VMAXPSZrm, 0 },
{ X86::VMAXSDZrr, X86::VMAXSDZrm, 0 },
{ X86::VMAXSDZrr_Int, X86::VMAXSDZrm_Int, TB_NO_REVERSE },
{ X86::VMAXSSZrr, X86::VMAXSSZrm, 0 },
{ X86::VMAXSSZrr_Int, X86::VMAXSSZrm_Int, TB_NO_REVERSE },
{ X86::VMINCPDZrr, X86::VMINCPDZrm, 0 },
{ X86::VMINCPSZrr, X86::VMINCPSZrm, 0 },
{ X86::VMINCSDZrr, X86::VMINCSDZrm, 0 },
{ X86::VMINCSSZrr, X86::VMINCSSZrm, 0 },
{ X86::VMINPDZrr, X86::VMINPDZrm, 0 },
{ X86::VMINPSZrr, X86::VMINPSZrm, 0 },
{ X86::VMINSDZrr, X86::VMINSDZrm, 0 },
{ X86::VMINSDZrr_Int, X86::VMINSDZrm_Int, TB_NO_REVERSE },
{ X86::VMINSSZrr, X86::VMINSSZrm, 0 },
{ X86::VMINSSZrr_Int, X86::VMINSSZrm_Int, TB_NO_REVERSE },
{ X86::VMULPDZrr, X86::VMULPDZrm, 0 },
{ X86::VMULPSZrr, X86::VMULPSZrm, 0 },
{ X86::VMULSDZrr, X86::VMULSDZrm, 0 },
{ X86::VMULSDZrr_Int, X86::VMULSDZrm_Int, TB_NO_REVERSE },
{ X86::VMULSSZrr, X86::VMULSSZrm, 0 },
{ X86::VMULSSZrr_Int, X86::VMULSSZrm_Int, TB_NO_REVERSE },
{ X86::VORPDZrr, X86::VORPDZrm, 0 },
{ X86::VORPSZrr, X86::VORPSZrm, 0 },
{ X86::VPADDBZrr, X86::VPADDBZrm, 0 },
{ X86::VPADDDZrr, X86::VPADDDZrm, 0 },
{ X86::VPADDQZrr, X86::VPADDQZrm, 0 },
{ X86::VPADDSBZrr, X86::VPADDSBZrm, 0 },
{ X86::VPADDSWZrr, X86::VPADDSWZrm, 0 },
{ X86::VPADDUSBZrr, X86::VPADDUSBZrm, 0 },
{ X86::VPADDUSWZrr, X86::VPADDUSWZrm, 0 },
{ X86::VPADDWZrr, X86::VPADDWZrm, 0 },
{ X86::VPALIGNRZrri, X86::VPALIGNRZrmi, 0 },
{ X86::VPANDDZrr, X86::VPANDDZrm, 0 },
{ X86::VPANDNDZrr, X86::VPANDNDZrm, 0 },
{ X86::VPANDNQZrr, X86::VPANDNQZrm, 0 },
{ X86::VPANDQZrr, X86::VPANDQZrm, 0 },
{ X86::VPCMPBZrri, X86::VPCMPBZrmi, 0 },
{ X86::VPCMPDZrri, X86::VPCMPDZrmi, 0 },
{ X86::VPCMPEQBZrr, X86::VPCMPEQBZrm, 0 },
{ X86::VPCMPEQDZrr, X86::VPCMPEQDZrm, 0 },
{ X86::VPCMPEQQZrr, X86::VPCMPEQQZrm, 0 },
{ X86::VPCMPEQWZrr, X86::VPCMPEQWZrm, 0 },
{ X86::VPCMPGTBZrr, X86::VPCMPGTBZrm, 0 },
{ X86::VPCMPGTDZrr, X86::VPCMPGTDZrm, 0 },
{ X86::VPCMPGTQZrr, X86::VPCMPGTQZrm, 0 },
{ X86::VPCMPGTWZrr, X86::VPCMPGTWZrm, 0 },
{ X86::VPCMPQZrri, X86::VPCMPQZrmi, 0 },
{ X86::VPCMPUBZrri, X86::VPCMPUBZrmi, 0 },
{ X86::VPCMPUDZrri, X86::VPCMPUDZrmi, 0 },
{ X86::VPCMPUQZrri, X86::VPCMPUQZrmi, 0 },
{ X86::VPCMPUWZrri, X86::VPCMPUWZrmi, 0 },
{ X86::VPCMPWZrri, X86::VPCMPWZrmi, 0 },
{ X86::VPERMBZrr, X86::VPERMBZrm, 0 },
{ X86::VPERMDZrr, X86::VPERMDZrm, 0 },
{ X86::VPERMILPDZrr, X86::VPERMILPDZrm, 0 },
{ X86::VPERMILPSZrr, X86::VPERMILPSZrm, 0 },
{ X86::VPERMPDZrr, X86::VPERMPDZrm, 0 },
{ X86::VPERMPSZrr, X86::VPERMPSZrm, 0 },
{ X86::VPERMQZrr, X86::VPERMQZrm, 0 },
{ X86::VPERMWZrr, X86::VPERMWZrm, 0 },
{ X86::VPMADDUBSWZrr, X86::VPMADDUBSWZrm, 0 },
{ X86::VPMADDWDZrr, X86::VPMADDWDZrm, 0 },
{ X86::VPMAXSDZrr, X86::VPMAXSDZrm, 0 },
{ X86::VPMAXSQZrr, X86::VPMAXSQZrm, 0 },
{ X86::VPMAXUDZrr, X86::VPMAXUDZrm, 0 },
{ X86::VPMAXUQZrr, X86::VPMAXUQZrm, 0 },
{ X86::VPMINSDZrr, X86::VPMINSDZrm, 0 },
{ X86::VPMINSQZrr, X86::VPMINSQZrm, 0 },
{ X86::VPMINUDZrr, X86::VPMINUDZrm, 0 },
{ X86::VPMINUQZrr, X86::VPMINUQZrm, 0 },
{ X86::VPMULDQZrr, X86::VPMULDQZrm, 0 },
{ X86::VPMULUDQZrr, X86::VPMULUDQZrm, 0 },
{ X86::VPORDZrr, X86::VPORDZrm, 0 },
{ X86::VPORQZrr, X86::VPORQZrm, 0 },
{ X86::VPSHUFBZrr, X86::VPSHUFBZrm, 0 },
{ X86::VPSLLVDZrr, X86::VPSLLVDZrm, 0 },
{ X86::VPSLLVQZrr, X86::VPSLLVQZrm, 0 },
{ X86::VPSRAVDZrr, X86::VPSRAVDZrm, 0 },
{ X86::VPSRLVDZrr, X86::VPSRLVDZrm, 0 },
{ X86::VPSRLVQZrr, X86::VPSRLVQZrm, 0 },
{ X86::VPSUBBZrr, X86::VPSUBBZrm, 0 },
{ X86::VPSUBDZrr, X86::VPSUBDZrm, 0 },
{ X86::VPSUBQZrr, X86::VPSUBQZrm, 0 },
{ X86::VPSUBSBZrr, X86::VPSUBSBZrm, 0 },
{ X86::VPSUBSWZrr, X86::VPSUBSWZrm, 0 },
{ X86::VPSUBUSBZrr, X86::VPSUBUSBZrm, 0 },
{ X86::VPSUBUSWZrr, X86::VPSUBUSWZrm, 0 },
{ X86::VPSUBWZrr, X86::VPSUBWZrm, 0 },
{ X86::VPUNPCKHBWZrr, X86::VPUNPCKHBWZrm, 0 },
{ X86::VPUNPCKHDQZrr, X86::VPUNPCKHDQZrm, 0 },
{ X86::VPUNPCKHQDQZrr, X86::VPUNPCKHQDQZrm, 0 },
{ X86::VPUNPCKHWDZrr, X86::VPUNPCKHWDZrm, 0 },
{ X86::VPUNPCKLBWZrr, X86::VPUNPCKLBWZrm, 0 },
{ X86::VPUNPCKLDQZrr, X86::VPUNPCKLDQZrm, 0 },
{ X86::VPUNPCKLQDQZrr, X86::VPUNPCKLQDQZrm, 0 },
{ X86::VPUNPCKLWDZrr, X86::VPUNPCKLWDZrm, 0 },
{ X86::VPXORDZrr, X86::VPXORDZrm, 0 },
{ X86::VPXORQZrr, X86::VPXORQZrm, 0 },
{ X86::VSHUFPDZrri, X86::VSHUFPDZrmi, 0 },
{ X86::VSHUFPSZrri, X86::VSHUFPSZrmi, 0 },
{ X86::VSUBPDZrr, X86::VSUBPDZrm, 0 },
{ X86::VSUBPSZrr, X86::VSUBPSZrm, 0 },
{ X86::VSUBSDZrr, X86::VSUBSDZrm, 0 },
{ X86::VSUBSDZrr_Int, X86::VSUBSDZrm_Int, TB_NO_REVERSE },
{ X86::VSUBSSZrr, X86::VSUBSSZrm, 0 },
{ X86::VSUBSSZrr_Int, X86::VSUBSSZrm_Int, TB_NO_REVERSE },
{ X86::VUNPCKHPDZrr, X86::VUNPCKHPDZrm, 0 },
{ X86::VUNPCKHPSZrr, X86::VUNPCKHPSZrm, 0 },
{ X86::VUNPCKLPDZrr, X86::VUNPCKLPDZrm, 0 },
{ X86::VUNPCKLPSZrr, X86::VUNPCKLPSZrm, 0 },
{ X86::VXORPDZrr, X86::VXORPDZrm, 0 },
{ X86::VXORPSZrr, X86::VXORPSZrm, 0 },
// AVX-512{F,VL} foldable instructions
{ X86::VADDPDZ128rr, X86::VADDPDZ128rm, 0 },
{ X86::VADDPDZ256rr, X86::VADDPDZ256rm, 0 },
{ X86::VADDPSZ128rr, X86::VADDPSZ128rm, 0 },
{ X86::VADDPSZ256rr, X86::VADDPSZ256rm, 0 },
{ X86::VALIGNDZ128rri, X86::VALIGNDZ128rmi, 0 },
{ X86::VALIGNDZ256rri, X86::VALIGNDZ256rmi, 0 },
{ X86::VALIGNQZ128rri, X86::VALIGNQZ128rmi, 0 },
{ X86::VALIGNQZ256rri, X86::VALIGNQZ256rmi, 0 },
{ X86::VANDNPDZ128rr, X86::VANDNPDZ128rm, 0 },
{ X86::VANDNPDZ256rr, X86::VANDNPDZ256rm, 0 },
{ X86::VANDNPSZ128rr, X86::VANDNPSZ128rm, 0 },
{ X86::VANDNPSZ256rr, X86::VANDNPSZ256rm, 0 },
{ X86::VANDPDZ128rr, X86::VANDPDZ128rm, 0 },
{ X86::VANDPDZ256rr, X86::VANDPDZ256rm, 0 },
{ X86::VANDPSZ128rr, X86::VANDPSZ128rm, 0 },
{ X86::VANDPSZ256rr, X86::VANDPSZ256rm, 0 },
{ X86::VBROADCASTSSZ128rkz, X86::VBROADCASTSSZ128mkz, TB_NO_REVERSE },
{ X86::VBROADCASTSSZ256rkz, X86::VBROADCASTSSZ256mkz, TB_NO_REVERSE },
{ X86::VBROADCASTSDZ256rkz, X86::VBROADCASTSDZ256mkz, TB_NO_REVERSE },
{ X86::VCMPPDZ128rri, X86::VCMPPDZ128rmi, 0 },
{ X86::VCMPPDZ256rri, X86::VCMPPDZ256rmi, 0 },
{ X86::VCMPPSZ128rri, X86::VCMPPSZ128rmi, 0 },
{ X86::VCMPPSZ256rri, X86::VCMPPSZ256rmi, 0 },
{ X86::VDIVPDZ128rr, X86::VDIVPDZ128rm, 0 },
{ X86::VDIVPDZ256rr, X86::VDIVPDZ256rm, 0 },
{ X86::VDIVPSZ128rr, X86::VDIVPSZ128rm, 0 },
{ X86::VDIVPSZ256rr, X86::VDIVPSZ256rm, 0 },
{ X86::VINSERTF32x4Z256rr,X86::VINSERTF32x4Z256rm, 0 },
{ X86::VINSERTF64x2Z256rr,X86::VINSERTF64x2Z256rm, 0 },
{ X86::VINSERTI32x4Z256rr,X86::VINSERTI32x4Z256rm, 0 },
{ X86::VINSERTI64x2Z256rr,X86::VINSERTI64x2Z256rm, 0 },
{ X86::VMAXCPDZ128rr, X86::VMAXCPDZ128rm, 0 },
{ X86::VMAXCPDZ256rr, X86::VMAXCPDZ256rm, 0 },
{ X86::VMAXCPSZ128rr, X86::VMAXCPSZ128rm, 0 },
{ X86::VMAXCPSZ256rr, X86::VMAXCPSZ256rm, 0 },
{ X86::VMAXPDZ128rr, X86::VMAXPDZ128rm, 0 },
{ X86::VMAXPDZ256rr, X86::VMAXPDZ256rm, 0 },
{ X86::VMAXPSZ128rr, X86::VMAXPSZ128rm, 0 },
{ X86::VMAXPSZ256rr, X86::VMAXPSZ256rm, 0 },
{ X86::VMINCPDZ128rr, X86::VMINCPDZ128rm, 0 },
{ X86::VMINCPDZ256rr, X86::VMINCPDZ256rm, 0 },
{ X86::VMINCPSZ128rr, X86::VMINCPSZ128rm, 0 },
{ X86::VMINCPSZ256rr, X86::VMINCPSZ256rm, 0 },
{ X86::VMINPDZ128rr, X86::VMINPDZ128rm, 0 },
{ X86::VMINPDZ256rr, X86::VMINPDZ256rm, 0 },
{ X86::VMINPSZ128rr, X86::VMINPSZ128rm, 0 },
{ X86::VMINPSZ256rr, X86::VMINPSZ256rm, 0 },
{ X86::VMULPDZ128rr, X86::VMULPDZ128rm, 0 },
{ X86::VMULPDZ256rr, X86::VMULPDZ256rm, 0 },
{ X86::VMULPSZ128rr, X86::VMULPSZ128rm, 0 },
{ X86::VMULPSZ256rr, X86::VMULPSZ256rm, 0 },
{ X86::VORPDZ128rr, X86::VORPDZ128rm, 0 },
{ X86::VORPDZ256rr, X86::VORPDZ256rm, 0 },
{ X86::VORPSZ128rr, X86::VORPSZ128rm, 0 },
{ X86::VORPSZ256rr, X86::VORPSZ256rm, 0 },
{ X86::VPADDBZ128rr, X86::VPADDBZ128rm, 0 },
{ X86::VPADDBZ256rr, X86::VPADDBZ256rm, 0 },
{ X86::VPADDDZ128rr, X86::VPADDDZ128rm, 0 },
{ X86::VPADDDZ256rr, X86::VPADDDZ256rm, 0 },
{ X86::VPADDQZ128rr, X86::VPADDQZ128rm, 0 },
{ X86::VPADDQZ256rr, X86::VPADDQZ256rm, 0 },
{ X86::VPADDSBZ128rr, X86::VPADDSBZ128rm, 0 },
{ X86::VPADDSBZ256rr, X86::VPADDSBZ256rm, 0 },
{ X86::VPADDSWZ128rr, X86::VPADDSWZ128rm, 0 },
{ X86::VPADDSWZ256rr, X86::VPADDSWZ256rm, 0 },
{ X86::VPADDUSBZ128rr, X86::VPADDUSBZ128rm, 0 },
{ X86::VPADDUSBZ256rr, X86::VPADDUSBZ256rm, 0 },
{ X86::VPADDUSWZ128rr, X86::VPADDUSWZ128rm, 0 },
{ X86::VPADDUSWZ256rr, X86::VPADDUSWZ256rm, 0 },
{ X86::VPADDWZ128rr, X86::VPADDWZ128rm, 0 },
{ X86::VPADDWZ256rr, X86::VPADDWZ256rm, 0 },
{ X86::VPALIGNRZ128rri, X86::VPALIGNRZ128rmi, 0 },
{ X86::VPALIGNRZ256rri, X86::VPALIGNRZ256rmi, 0 },
{ X86::VPANDDZ128rr, X86::VPANDDZ128rm, 0 },
{ X86::VPANDDZ256rr, X86::VPANDDZ256rm, 0 },
{ X86::VPANDNDZ128rr, X86::VPANDNDZ128rm, 0 },
{ X86::VPANDNDZ256rr, X86::VPANDNDZ256rm, 0 },
{ X86::VPANDNQZ128rr, X86::VPANDNQZ128rm, 0 },
{ X86::VPANDNQZ256rr, X86::VPANDNQZ256rm, 0 },
{ X86::VPANDQZ128rr, X86::VPANDQZ128rm, 0 },
{ X86::VPANDQZ256rr, X86::VPANDQZ256rm, 0 },
{ X86::VPCMPBZ128rri, X86::VPCMPBZ128rmi, 0 },
{ X86::VPCMPBZ256rri, X86::VPCMPBZ256rmi, 0 },
{ X86::VPCMPDZ128rri, X86::VPCMPDZ128rmi, 0 },
{ X86::VPCMPDZ256rri, X86::VPCMPDZ256rmi, 0 },
{ X86::VPCMPEQBZ128rr, X86::VPCMPEQBZ128rm, 0 },
{ X86::VPCMPEQBZ256rr, X86::VPCMPEQBZ256rm, 0 },
{ X86::VPCMPEQDZ128rr, X86::VPCMPEQDZ128rm, 0 },
{ X86::VPCMPEQDZ256rr, X86::VPCMPEQDZ256rm, 0 },
{ X86::VPCMPEQQZ128rr, X86::VPCMPEQQZ128rm, 0 },
{ X86::VPCMPEQQZ256rr, X86::VPCMPEQQZ256rm, 0 },
{ X86::VPCMPEQWZ128rr, X86::VPCMPEQWZ128rm, 0 },
{ X86::VPCMPEQWZ256rr, X86::VPCMPEQWZ256rm, 0 },
{ X86::VPCMPGTBZ128rr, X86::VPCMPGTBZ128rm, 0 },
{ X86::VPCMPGTBZ256rr, X86::VPCMPGTBZ256rm, 0 },
{ X86::VPCMPGTDZ128rr, X86::VPCMPGTDZ128rm, 0 },
{ X86::VPCMPGTDZ256rr, X86::VPCMPGTDZ256rm, 0 },
{ X86::VPCMPGTQZ128rr, X86::VPCMPGTQZ128rm, 0 },
{ X86::VPCMPGTQZ256rr, X86::VPCMPGTQZ256rm, 0 },
{ X86::VPCMPGTWZ128rr, X86::VPCMPGTWZ128rm, 0 },
{ X86::VPCMPGTWZ256rr, X86::VPCMPGTWZ256rm, 0 },
{ X86::VPCMPQZ128rri, X86::VPCMPQZ128rmi, 0 },
{ X86::VPCMPQZ256rri, X86::VPCMPQZ256rmi, 0 },
{ X86::VPCMPUBZ128rri, X86::VPCMPUBZ128rmi, 0 },
{ X86::VPCMPUBZ256rri, X86::VPCMPUBZ256rmi, 0 },
{ X86::VPCMPUDZ128rri, X86::VPCMPUDZ128rmi, 0 },
{ X86::VPCMPUDZ256rri, X86::VPCMPUDZ256rmi, 0 },
{ X86::VPCMPUQZ128rri, X86::VPCMPUQZ128rmi, 0 },
{ X86::VPCMPUQZ256rri, X86::VPCMPUQZ256rmi, 0 },
{ X86::VPCMPUWZ128rri, X86::VPCMPUWZ128rmi, 0 },
{ X86::VPCMPUWZ256rri, X86::VPCMPUWZ256rmi, 0 },
{ X86::VPCMPWZ128rri, X86::VPCMPWZ128rmi, 0 },
{ X86::VPCMPWZ256rri, X86::VPCMPWZ256rmi, 0 },
{ X86::VPERMBZ128rr, X86::VPERMBZ128rm, 0 },
{ X86::VPERMBZ256rr, X86::VPERMBZ256rm, 0 },
{ X86::VPERMDZ256rr, X86::VPERMDZ256rm, 0 },
{ X86::VPERMILPDZ128rr, X86::VPERMILPDZ128rm, 0 },
{ X86::VPERMILPDZ256rr, X86::VPERMILPDZ256rm, 0 },
{ X86::VPERMILPSZ128rr, X86::VPERMILPSZ128rm, 0 },
{ X86::VPERMILPSZ256rr, X86::VPERMILPSZ256rm, 0 },
{ X86::VPERMPDZ256rr, X86::VPERMPDZ256rm, 0 },
{ X86::VPERMPSZ256rr, X86::VPERMPSZ256rm, 0 },
{ X86::VPERMQZ256rr, X86::VPERMQZ256rm, 0 },
{ X86::VPERMWZ128rr, X86::VPERMWZ128rm, 0 },
{ X86::VPERMWZ256rr, X86::VPERMWZ256rm, 0 },
{ X86::VPMADDUBSWZ128rr, X86::VPMADDUBSWZ128rm, 0 },
{ X86::VPMADDUBSWZ256rr, X86::VPMADDUBSWZ256rm, 0 },
{ X86::VPMADDWDZ128rr, X86::VPMADDWDZ128rm, 0 },
{ X86::VPMADDWDZ256rr, X86::VPMADDWDZ256rm, 0 },
{ X86::VPORDZ128rr, X86::VPORDZ128rm, 0 },
{ X86::VPORDZ256rr, X86::VPORDZ256rm, 0 },
{ X86::VPORQZ128rr, X86::VPORQZ128rm, 0 },
{ X86::VPORQZ256rr, X86::VPORQZ256rm, 0 },
{ X86::VPSHUFBZ128rr, X86::VPSHUFBZ128rm, 0 },
{ X86::VPSHUFBZ256rr, X86::VPSHUFBZ256rm, 0 },
{ X86::VPSUBBZ128rr, X86::VPSUBBZ128rm, 0 },
{ X86::VPSUBBZ256rr, X86::VPSUBBZ256rm, 0 },
{ X86::VPSUBDZ128rr, X86::VPSUBDZ128rm, 0 },
{ X86::VPSUBDZ256rr, X86::VPSUBDZ256rm, 0 },
{ X86::VPSUBQZ128rr, X86::VPSUBQZ128rm, 0 },
{ X86::VPSUBQZ256rr, X86::VPSUBQZ256rm, 0 },
{ X86::VPSUBSBZ128rr, X86::VPSUBSBZ128rm, 0 },
{ X86::VPSUBSBZ256rr, X86::VPSUBSBZ256rm, 0 },
{ X86::VPSUBSWZ128rr, X86::VPSUBSWZ128rm, 0 },
{ X86::VPSUBSWZ256rr, X86::VPSUBSWZ256rm, 0 },
{ X86::VPSUBUSBZ128rr, X86::VPSUBUSBZ128rm, 0 },
{ X86::VPSUBUSBZ256rr, X86::VPSUBUSBZ256rm, 0 },
{ X86::VPSUBUSWZ128rr, X86::VPSUBUSWZ128rm, 0 },
{ X86::VPSUBUSWZ256rr, X86::VPSUBUSWZ256rm, 0 },
{ X86::VPSUBWZ128rr, X86::VPSUBWZ128rm, 0 },
{ X86::VPSUBWZ256rr, X86::VPSUBWZ256rm, 0 },
{ X86::VPUNPCKHBWZ128rr, X86::VPUNPCKHBWZ128rm, 0 },
{ X86::VPUNPCKHBWZ256rr, X86::VPUNPCKHBWZ256rm, 0 },
{ X86::VPUNPCKHDQZ128rr, X86::VPUNPCKHDQZ128rm, 0 },
{ X86::VPUNPCKHDQZ256rr, X86::VPUNPCKHDQZ256rm, 0 },
{ X86::VPUNPCKHQDQZ128rr, X86::VPUNPCKHQDQZ128rm, 0 },
{ X86::VPUNPCKHQDQZ256rr, X86::VPUNPCKHQDQZ256rm, 0 },
{ X86::VPUNPCKHWDZ128rr, X86::VPUNPCKHWDZ128rm, 0 },
{ X86::VPUNPCKHWDZ256rr, X86::VPUNPCKHWDZ256rm, 0 },
{ X86::VPUNPCKLBWZ128rr, X86::VPUNPCKLBWZ128rm, 0 },
{ X86::VPUNPCKLBWZ256rr, X86::VPUNPCKLBWZ256rm, 0 },
{ X86::VPUNPCKLDQZ128rr, X86::VPUNPCKLDQZ128rm, 0 },
{ X86::VPUNPCKLDQZ256rr, X86::VPUNPCKLDQZ256rm, 0 },
{ X86::VPUNPCKLQDQZ128rr, X86::VPUNPCKLQDQZ128rm, 0 },
{ X86::VPUNPCKLQDQZ256rr, X86::VPUNPCKLQDQZ256rm, 0 },
{ X86::VPUNPCKLWDZ128rr, X86::VPUNPCKLWDZ128rm, 0 },
{ X86::VPUNPCKLWDZ256rr, X86::VPUNPCKLWDZ256rm, 0 },
{ X86::VPXORDZ128rr, X86::VPXORDZ128rm, 0 },
{ X86::VPXORDZ256rr, X86::VPXORDZ256rm, 0 },
{ X86::VPXORQZ128rr, X86::VPXORQZ128rm, 0 },
{ X86::VPXORQZ256rr, X86::VPXORQZ256rm, 0 },
{ X86::VSUBPDZ128rr, X86::VSUBPDZ128rm, 0 },
{ X86::VSUBPDZ256rr, X86::VSUBPDZ256rm, 0 },
{ X86::VSUBPSZ128rr, X86::VSUBPSZ128rm, 0 },
{ X86::VSUBPSZ256rr, X86::VSUBPSZ256rm, 0 },
{ X86::VUNPCKHPDZ128rr, X86::VUNPCKHPDZ128rm, 0 },
{ X86::VUNPCKHPDZ256rr, X86::VUNPCKHPDZ256rm, 0 },
{ X86::VUNPCKHPSZ128rr, X86::VUNPCKHPSZ128rm, 0 },
{ X86::VUNPCKHPSZ256rr, X86::VUNPCKHPSZ256rm, 0 },
{ X86::VUNPCKLPDZ128rr, X86::VUNPCKLPDZ128rm, 0 },
{ X86::VUNPCKLPDZ256rr, X86::VUNPCKLPDZ256rm, 0 },
{ X86::VUNPCKLPSZ128rr, X86::VUNPCKLPSZ128rm, 0 },
{ X86::VUNPCKLPSZ256rr, X86::VUNPCKLPSZ256rm, 0 },
{ X86::VXORPDZ128rr, X86::VXORPDZ128rm, 0 },
{ X86::VXORPDZ256rr, X86::VXORPDZ256rm, 0 },
{ X86::VXORPSZ128rr, X86::VXORPSZ128rm, 0 },
{ X86::VXORPSZ256rr, X86::VXORPSZ256rm, 0 },
// AVX-512 masked foldable instructions
{ X86::VPERMILPDZrikz, X86::VPERMILPDZmikz, 0 },
{ X86::VPERMILPSZrikz, X86::VPERMILPSZmikz, 0 },
{ X86::VPERMPDZrikz, X86::VPERMPDZmikz, 0 },
{ X86::VPERMQZrikz, X86::VPERMQZmikz, 0 },
{ X86::VPMOVSXBDZrrkz, X86::VPMOVSXBDZrmkz, 0 },
{ X86::VPMOVSXBQZrrkz, X86::VPMOVSXBQZrmkz, TB_NO_REVERSE },
{ X86::VPMOVSXBWZrrkz, X86::VPMOVSXBWZrmkz, 0 },
{ X86::VPMOVSXDQZrrkz, X86::VPMOVSXDQZrmkz, 0 },
{ X86::VPMOVSXWDZrrkz, X86::VPMOVSXWDZrmkz, 0 },
{ X86::VPMOVSXWQZrrkz, X86::VPMOVSXWQZrmkz, 0 },
{ X86::VPMOVZXBDZrrkz, X86::VPMOVZXBDZrmkz, 0 },
{ X86::VPMOVZXBQZrrkz, X86::VPMOVZXBQZrmkz, TB_NO_REVERSE },
{ X86::VPMOVZXBWZrrkz, X86::VPMOVZXBWZrmkz, 0 },
{ X86::VPMOVZXDQZrrkz, X86::VPMOVZXDQZrmkz, 0 },
{ X86::VPMOVZXWDZrrkz, X86::VPMOVZXWDZrmkz, 0 },
{ X86::VPMOVZXWQZrrkz, X86::VPMOVZXWQZrmkz, 0 },
{ X86::VPSHUFDZrikz, X86::VPSHUFDZmikz, 0 },
{ X86::VPSHUFHWZrikz, X86::VPSHUFHWZmikz, 0 },
{ X86::VPSHUFLWZrikz, X86::VPSHUFLWZmikz, 0 },
// AVX-512VL 256-bit masked foldable instructions
{ X86::VPERMILPDZ256rikz, X86::VPERMILPDZ256mikz, 0 },
{ X86::VPERMILPSZ256rikz, X86::VPERMILPSZ256mikz, 0 },
{ X86::VPERMPDZ256rikz, X86::VPERMPDZ256mikz, 0 },
{ X86::VPERMQZ256rikz, X86::VPERMQZ256mikz, 0 },
{ X86::VPMOVSXBDZ256rrkz, X86::VPMOVSXBDZ256rmkz, TB_NO_REVERSE },
{ X86::VPMOVSXBQZ256rrkz, X86::VPMOVSXBQZ256rmkz, TB_NO_REVERSE },
{ X86::VPMOVSXBWZ256rrkz, X86::VPMOVSXBWZ256rmkz, 0 },
{ X86::VPMOVSXDQZ256rrkz, X86::VPMOVSXDQZ256rmkz, 0 },
{ X86::VPMOVSXWDZ256rrkz, X86::VPMOVSXWDZ256rmkz, 0 },
{ X86::VPMOVSXWQZ256rrkz, X86::VPMOVSXWQZ256rmkz, TB_NO_REVERSE },
{ X86::VPMOVZXBDZ256rrkz, X86::VPMOVZXBDZ256rmkz, TB_NO_REVERSE },
{ X86::VPMOVZXBQZ256rrkz, X86::VPMOVZXBQZ256rmkz, TB_NO_REVERSE },
{ X86::VPMOVZXBWZ256rrkz, X86::VPMOVZXBWZ256rmkz, 0 },
{ X86::VPMOVZXDQZ256rrkz, X86::VPMOVZXDQZ256rmkz, 0 },
{ X86::VPMOVZXWDZ256rrkz, X86::VPMOVZXWDZ256rmkz, 0 },
{ X86::VPMOVZXWQZ256rrkz, X86::VPMOVZXWQZ256rmkz, TB_NO_REVERSE },
{ X86::VPSHUFDZ256rikz, X86::VPSHUFDZ256mikz, 0 },
{ X86::VPSHUFHWZ256rikz, X86::VPSHUFHWZ256mikz, 0 },
{ X86::VPSHUFLWZ256rikz, X86::VPSHUFLWZ256mikz, 0 },
// AVX-512VL 128-bit masked foldable instructions
{ X86::VPERMILPDZ128rikz, X86::VPERMILPDZ128mikz, 0 },
{ X86::VPERMILPSZ128rikz, X86::VPERMILPSZ128mikz, 0 },
{ X86::VPMOVSXBDZ128rrkz, X86::VPMOVSXBDZ128rmkz, TB_NO_REVERSE },
{ X86::VPMOVSXBQZ128rrkz, X86::VPMOVSXBQZ128rmkz, TB_NO_REVERSE },
{ X86::VPMOVSXBWZ128rrkz, X86::VPMOVSXBWZ128rmkz, TB_NO_REVERSE },
{ X86::VPMOVSXDQZ128rrkz, X86::VPMOVSXDQZ128rmkz, TB_NO_REVERSE },
{ X86::VPMOVSXWDZ128rrkz, X86::VPMOVSXWDZ128rmkz, TB_NO_REVERSE },
{ X86::VPMOVSXWQZ128rrkz, X86::VPMOVSXWQZ128rmkz, TB_NO_REVERSE },
{ X86::VPMOVZXBDZ128rrkz, X86::VPMOVZXBDZ128rmkz, TB_NO_REVERSE },
{ X86::VPMOVZXBQZ128rrkz, X86::VPMOVZXBQZ128rmkz, TB_NO_REVERSE },
{ X86::VPMOVZXBWZ128rrkz, X86::VPMOVZXBWZ128rmkz, TB_NO_REVERSE },
{ X86::VPMOVZXDQZ128rrkz, X86::VPMOVZXDQZ128rmkz, TB_NO_REVERSE },
{ X86::VPMOVZXWDZ128rrkz, X86::VPMOVZXWDZ128rmkz, TB_NO_REVERSE },
{ X86::VPMOVZXWQZ128rrkz, X86::VPMOVZXWQZ128rmkz, TB_NO_REVERSE },
{ X86::VPSHUFDZ128rikz, X86::VPSHUFDZ128mikz, 0 },
{ X86::VPSHUFHWZ128rikz, X86::VPSHUFHWZ128mikz, 0 },
{ X86::VPSHUFLWZ128rikz, X86::VPSHUFLWZ128mikz, 0 },
// AES foldable instructions
{ X86::AESDECLASTrr, X86::AESDECLASTrm, TB_ALIGN_16 },
{ X86::AESDECrr, X86::AESDECrm, TB_ALIGN_16 },
{ X86::AESENCLASTrr, X86::AESENCLASTrm, TB_ALIGN_16 },
{ X86::AESENCrr, X86::AESENCrm, TB_ALIGN_16 },
{ X86::VAESDECLASTrr, X86::VAESDECLASTrm, 0 },
{ X86::VAESDECrr, X86::VAESDECrm, 0 },
{ X86::VAESENCLASTrr, X86::VAESENCLASTrm, 0 },
{ X86::VAESENCrr, X86::VAESENCrm, 0 },
// SHA foldable instructions
{ X86::SHA1MSG1rr, X86::SHA1MSG1rm, TB_ALIGN_16 },
{ X86::SHA1MSG2rr, X86::SHA1MSG2rm, TB_ALIGN_16 },
{ X86::SHA1NEXTErr, X86::SHA1NEXTErm, TB_ALIGN_16 },
{ X86::SHA1RNDS4rri, X86::SHA1RNDS4rmi, TB_ALIGN_16 },
{ X86::SHA256MSG1rr, X86::SHA256MSG1rm, TB_ALIGN_16 },
{ X86::SHA256MSG2rr, X86::SHA256MSG2rm, TB_ALIGN_16 },
{ X86::SHA256RNDS2rr, X86::SHA256RNDS2rm, TB_ALIGN_16 }
};
for (X86MemoryFoldTableEntry Entry : MemoryFoldTable2) {
AddTableEntry(RegOp2MemOpTable2, MemOp2RegOpTable,
Entry.RegOp, Entry.MemOp,
// Index 2, folded load
Entry.Flags | TB_INDEX_2 | TB_FOLDED_LOAD);
}
static const X86MemoryFoldTableEntry MemoryFoldTable3[] = {
// FMA4 foldable patterns
{ X86::VFMADDSS4rr, X86::VFMADDSS4rm, TB_ALIGN_NONE },
{ X86::VFMADDSS4rr_Int, X86::VFMADDSS4rm_Int, TB_NO_REVERSE },
{ X86::VFMADDSD4rr, X86::VFMADDSD4rm, TB_ALIGN_NONE },
{ X86::VFMADDSD4rr_Int, X86::VFMADDSD4rm_Int, TB_NO_REVERSE },
{ X86::VFMADDPS4rr, X86::VFMADDPS4rm, TB_ALIGN_NONE },
{ X86::VFMADDPD4rr, X86::VFMADDPD4rm, TB_ALIGN_NONE },
{ X86::VFMADDPS4Yrr, X86::VFMADDPS4Yrm, TB_ALIGN_NONE },
{ X86::VFMADDPD4Yrr, X86::VFMADDPD4Yrm, TB_ALIGN_NONE },
{ X86::VFNMADDSS4rr, X86::VFNMADDSS4rm, TB_ALIGN_NONE },
{ X86::VFNMADDSS4rr_Int, X86::VFNMADDSS4rm_Int, TB_NO_REVERSE },
{ X86::VFNMADDSD4rr, X86::VFNMADDSD4rm, TB_ALIGN_NONE },
{ X86::VFNMADDSD4rr_Int, X86::VFNMADDSD4rm_Int, TB_NO_REVERSE },
{ X86::VFNMADDPS4rr, X86::VFNMADDPS4rm, TB_ALIGN_NONE },
{ X86::VFNMADDPD4rr, X86::VFNMADDPD4rm, TB_ALIGN_NONE },
{ X86::VFNMADDPS4Yrr, X86::VFNMADDPS4Yrm, TB_ALIGN_NONE },
{ X86::VFNMADDPD4Yrr, X86::VFNMADDPD4Yrm, TB_ALIGN_NONE },
{ X86::VFMSUBSS4rr, X86::VFMSUBSS4rm, TB_ALIGN_NONE },
{ X86::VFMSUBSS4rr_Int, X86::VFMSUBSS4rm_Int, TB_NO_REVERSE },
{ X86::VFMSUBSD4rr, X86::VFMSUBSD4rm, TB_ALIGN_NONE },
{ X86::VFMSUBSD4rr_Int, X86::VFMSUBSD4rm_Int, TB_NO_REVERSE },
{ X86::VFMSUBPS4rr, X86::VFMSUBPS4rm, TB_ALIGN_NONE },
{ X86::VFMSUBPD4rr, X86::VFMSUBPD4rm, TB_ALIGN_NONE },
{ X86::VFMSUBPS4Yrr, X86::VFMSUBPS4Yrm, TB_ALIGN_NONE },
{ X86::VFMSUBPD4Yrr, X86::VFMSUBPD4Yrm, TB_ALIGN_NONE },
{ X86::VFNMSUBSS4rr, X86::VFNMSUBSS4rm, TB_ALIGN_NONE },
{ X86::VFNMSUBSS4rr_Int, X86::VFNMSUBSS4rm_Int, TB_NO_REVERSE },
{ X86::VFNMSUBSD4rr, X86::VFNMSUBSD4rm, TB_ALIGN_NONE },
{ X86::VFNMSUBSD4rr_Int, X86::VFNMSUBSD4rm_Int, TB_NO_REVERSE },
{ X86::VFNMSUBPS4rr, X86::VFNMSUBPS4rm, TB_ALIGN_NONE },
{ X86::VFNMSUBPD4rr, X86::VFNMSUBPD4rm, TB_ALIGN_NONE },
{ X86::VFNMSUBPS4Yrr, X86::VFNMSUBPS4Yrm, TB_ALIGN_NONE },
{ X86::VFNMSUBPD4Yrr, X86::VFNMSUBPD4Yrm, TB_ALIGN_NONE },
{ X86::VFMADDSUBPS4rr, X86::VFMADDSUBPS4rm, TB_ALIGN_NONE },
{ X86::VFMADDSUBPD4rr, X86::VFMADDSUBPD4rm, TB_ALIGN_NONE },
{ X86::VFMADDSUBPS4Yrr, X86::VFMADDSUBPS4Yrm, TB_ALIGN_NONE },
{ X86::VFMADDSUBPD4Yrr, X86::VFMADDSUBPD4Yrm, TB_ALIGN_NONE },
{ X86::VFMSUBADDPS4rr, X86::VFMSUBADDPS4rm, TB_ALIGN_NONE },
{ X86::VFMSUBADDPD4rr, X86::VFMSUBADDPD4rm, TB_ALIGN_NONE },
{ X86::VFMSUBADDPS4Yrr, X86::VFMSUBADDPS4Yrm, TB_ALIGN_NONE },
{ X86::VFMSUBADDPD4Yrr, X86::VFMSUBADDPD4Yrm, TB_ALIGN_NONE },
// XOP foldable instructions
{ X86::VPCMOVrrr, X86::VPCMOVrrm, 0 },
{ X86::VPCMOVrrrY, X86::VPCMOVrrmY, 0 },
{ X86::VPERMIL2PDrr, X86::VPERMIL2PDrm, 0 },
{ X86::VPERMIL2PDrrY, X86::VPERMIL2PDrmY, 0 },
{ X86::VPERMIL2PSrr, X86::VPERMIL2PSrm, 0 },
{ X86::VPERMIL2PSrrY, X86::VPERMIL2PSrmY, 0 },
{ X86::VPPERMrrr, X86::VPPERMrrm, 0 },
// AVX-512 instructions with 3 source operands.
{ X86::VBLENDMPDZrr, X86::VBLENDMPDZrm, 0 },
{ X86::VBLENDMPSZrr, X86::VBLENDMPSZrm, 0 },
{ X86::VPBLENDMDZrr, X86::VPBLENDMDZrm, 0 },
{ X86::VPBLENDMQZrr, X86::VPBLENDMQZrm, 0 },
{ X86::VBROADCASTSSZrk, X86::VBROADCASTSSZmk, TB_NO_REVERSE },
{ X86::VBROADCASTSDZrk, X86::VBROADCASTSDZmk, TB_NO_REVERSE },
{ X86::VBROADCASTSSZ256rk, X86::VBROADCASTSSZ256mk, TB_NO_REVERSE },
{ X86::VBROADCASTSDZ256rk, X86::VBROADCASTSDZ256mk, TB_NO_REVERSE },
{ X86::VBROADCASTSSZ128rk, X86::VBROADCASTSSZ128mk, TB_NO_REVERSE },
{ X86::VPERMI2Brr, X86::VPERMI2Brm, 0 },
{ X86::VPERMI2Drr, X86::VPERMI2Drm, 0 },
{ X86::VPERMI2PSrr, X86::VPERMI2PSrm, 0 },
{ X86::VPERMI2PDrr, X86::VPERMI2PDrm, 0 },
{ X86::VPERMI2Qrr, X86::VPERMI2Qrm, 0 },
{ X86::VPERMI2Wrr, X86::VPERMI2Wrm, 0 },
{ X86::VPERMT2Brr, X86::VPERMT2Brm, 0 },
{ X86::VPERMT2Drr, X86::VPERMT2Drm, 0 },
{ X86::VPERMT2PSrr, X86::VPERMT2PSrm, 0 },
{ X86::VPERMT2PDrr, X86::VPERMT2PDrm, 0 },
{ X86::VPERMT2Qrr, X86::VPERMT2Qrm, 0 },
{ X86::VPERMT2Wrr, X86::VPERMT2Wrm, 0 },
{ X86::VPTERNLOGDZrri, X86::VPTERNLOGDZrmi, 0 },
{ X86::VPTERNLOGQZrri, X86::VPTERNLOGQZrmi, 0 },
// AVX-512VL 256-bit instructions with 3 source operands.
{ X86::VPERMI2B256rr, X86::VPERMI2B256rm, 0 },
{ X86::VPERMI2D256rr, X86::VPERMI2D256rm, 0 },
{ X86::VPERMI2PD256rr, X86::VPERMI2PD256rm, 0 },
{ X86::VPERMI2PS256rr, X86::VPERMI2PS256rm, 0 },
{ X86::VPERMI2Q256rr, X86::VPERMI2Q256rm, 0 },
{ X86::VPERMI2W256rr, X86::VPERMI2W256rm, 0 },
{ X86::VPERMT2B256rr, X86::VPERMT2B256rm, 0 },
{ X86::VPERMT2D256rr, X86::VPERMT2D256rm, 0 },
{ X86::VPERMT2PD256rr, X86::VPERMT2PD256rm, 0 },
{ X86::VPERMT2PS256rr, X86::VPERMT2PS256rm, 0 },
{ X86::VPERMT2Q256rr, X86::VPERMT2Q256rm, 0 },
{ X86::VPERMT2W256rr, X86::VPERMT2W256rm, 0 },
{ X86::VPTERNLOGDZ256rri, X86::VPTERNLOGDZ256rmi, 0 },
{ X86::VPTERNLOGQZ256rri, X86::VPTERNLOGQZ256rmi, 0 },
// AVX-512VL 128-bit instructions with 3 source operands.
{ X86::VPERMI2B128rr, X86::VPERMI2B128rm, 0 },
{ X86::VPERMI2D128rr, X86::VPERMI2D128rm, 0 },
{ X86::VPERMI2PD128rr, X86::VPERMI2PD128rm, 0 },
{ X86::VPERMI2PS128rr, X86::VPERMI2PS128rm, 0 },
{ X86::VPERMI2Q128rr, X86::VPERMI2Q128rm, 0 },
{ X86::VPERMI2W128rr, X86::VPERMI2W128rm, 0 },
{ X86::VPERMT2B128rr, X86::VPERMT2B128rm, 0 },
{ X86::VPERMT2D128rr, X86::VPERMT2D128rm, 0 },
{ X86::VPERMT2PD128rr, X86::VPERMT2PD128rm, 0 },
{ X86::VPERMT2PS128rr, X86::VPERMT2PS128rm, 0 },
{ X86::VPERMT2Q128rr, X86::VPERMT2Q128rm, 0 },
{ X86::VPERMT2W128rr, X86::VPERMT2W128rm, 0 },
{ X86::VPTERNLOGDZ128rri, X86::VPTERNLOGDZ128rmi, 0 },
{ X86::VPTERNLOGQZ128rri, X86::VPTERNLOGQZ128rmi, 0 },
// AVX-512 masked instructions
{ X86::VADDPDZrrkz, X86::VADDPDZrmkz, 0 },
{ X86::VADDPSZrrkz, X86::VADDPSZrmkz, 0 },
{ X86::VALIGNDZrrikz, X86::VALIGNDZrmikz, 0 },
{ X86::VALIGNQZrrikz, X86::VALIGNQZrmikz, 0 },
{ X86::VANDNPDZrrkz, X86::VANDNPDZrmkz, 0 },
{ X86::VANDNPSZrrkz, X86::VANDNPSZrmkz, 0 },
{ X86::VANDPDZrrkz, X86::VANDPDZrmkz, 0 },
{ X86::VANDPSZrrkz, X86::VANDPSZrmkz, 0 },
{ X86::VDIVPDZrrkz, X86::VDIVPDZrmkz, 0 },
{ X86::VDIVPSZrrkz, X86::VDIVPSZrmkz, 0 },
{ X86::VINSERTF32x4Zrrkz, X86::VINSERTF32x4Zrmkz, 0 },
{ X86::VINSERTF32x8Zrrkz, X86::VINSERTF32x8Zrmkz, 0 },
{ X86::VINSERTF64x2Zrrkz, X86::VINSERTF64x2Zrmkz, 0 },
{ X86::VINSERTF64x4Zrrkz, X86::VINSERTF64x4Zrmkz, 0 },
{ X86::VINSERTI32x4Zrrkz, X86::VINSERTI32x4Zrmkz, 0 },
{ X86::VINSERTI32x8Zrrkz, X86::VINSERTI32x8Zrmkz, 0 },
{ X86::VINSERTI64x2Zrrkz, X86::VINSERTI64x2Zrmkz, 0 },
{ X86::VINSERTI64x4Zrrkz, X86::VINSERTI64x4Zrmkz, 0 },
{ X86::VMAXCPDZrrkz, X86::VMAXCPDZrmkz, 0 },
{ X86::VMAXCPSZrrkz, X86::VMAXCPSZrmkz, 0 },
{ X86::VMAXPDZrrkz, X86::VMAXPDZrmkz, 0 },
{ X86::VMAXPSZrrkz, X86::VMAXPSZrmkz, 0 },
{ X86::VMINCPDZrrkz, X86::VMINCPDZrmkz, 0 },
{ X86::VMINCPSZrrkz, X86::VMINCPSZrmkz, 0 },
{ X86::VMINPDZrrkz, X86::VMINPDZrmkz, 0 },
{ X86::VMINPSZrrkz, X86::VMINPSZrmkz, 0 },
{ X86::VMULPDZrrkz, X86::VMULPDZrmkz, 0 },
{ X86::VMULPSZrrkz, X86::VMULPSZrmkz, 0 },
{ X86::VORPDZrrkz, X86::VORPDZrmkz, 0 },
{ X86::VORPSZrrkz, X86::VORPSZrmkz, 0 },
{ X86::VPADDBZrrkz, X86::VPADDBZrmkz, 0 },
{ X86::VPADDDZrrkz, X86::VPADDDZrmkz, 0 },
{ X86::VPADDQZrrkz, X86::VPADDQZrmkz, 0 },
{ X86::VPADDSBZrrkz, X86::VPADDSBZrmkz, 0 },
{ X86::VPADDSWZrrkz, X86::VPADDSWZrmkz, 0 },
{ X86::VPADDUSBZrrkz, X86::VPADDUSBZrmkz, 0 },
{ X86::VPADDUSWZrrkz, X86::VPADDUSWZrmkz, 0 },
{ X86::VPADDWZrrkz, X86::VPADDWZrmkz, 0 },
{ X86::VPALIGNRZrrikz, X86::VPALIGNRZrmikz, 0 },
{ X86::VPANDDZrrkz, X86::VPANDDZrmkz, 0 },
{ X86::VPANDNDZrrkz, X86::VPANDNDZrmkz, 0 },
{ X86::VPANDNQZrrkz, X86::VPANDNQZrmkz, 0 },
{ X86::VPANDQZrrkz, X86::VPANDQZrmkz, 0 },
{ X86::VPERMBZrrkz, X86::VPERMBZrmkz, 0 },
{ X86::VPERMDZrrkz, X86::VPERMDZrmkz, 0 },
{ X86::VPERMILPDZrrkz, X86::VPERMILPDZrmkz, 0 },
{ X86::VPERMILPSZrrkz, X86::VPERMILPSZrmkz, 0 },
{ X86::VPERMPDZrrkz, X86::VPERMPDZrmkz, 0 },
{ X86::VPERMPSZrrkz, X86::VPERMPSZrmkz, 0 },
{ X86::VPERMQZrrkz, X86::VPERMQZrmkz, 0 },
{ X86::VPERMWZrrkz, X86::VPERMWZrmkz, 0 },
{ X86::VPMADDUBSWZrrkz, X86::VPMADDUBSWZrmkz, 0 },
{ X86::VPMADDWDZrrkz, X86::VPMADDWDZrmkz, 0 },
{ X86::VPORDZrrkz, X86::VPORDZrmkz, 0 },
{ X86::VPORQZrrkz, X86::VPORQZrmkz, 0 },
{ X86::VPSHUFBZrrkz, X86::VPSHUFBZrmkz, 0 },
{ X86::VPSUBBZrrkz, X86::VPSUBBZrmkz, 0 },
{ X86::VPSUBDZrrkz, X86::VPSUBDZrmkz, 0 },
{ X86::VPSUBQZrrkz, X86::VPSUBQZrmkz, 0 },
{ X86::VPSUBSBZrrkz, X86::VPSUBSBZrmkz, 0 },
{ X86::VPSUBSWZrrkz, X86::VPSUBSWZrmkz, 0 },
{ X86::VPSUBUSBZrrkz, X86::VPSUBUSBZrmkz, 0 },
{ X86::VPSUBUSWZrrkz, X86::VPSUBUSWZrmkz, 0 },
{ X86::VPSUBWZrrkz, X86::VPSUBWZrmkz, 0 },
{ X86::VPUNPCKHBWZrrkz, X86::VPUNPCKHBWZrmkz, 0 },
{ X86::VPUNPCKHDQZrrkz, X86::VPUNPCKHDQZrmkz, 0 },
{ X86::VPUNPCKHQDQZrrkz, X86::VPUNPCKHQDQZrmkz, 0 },
{ X86::VPUNPCKHWDZrrkz, X86::VPUNPCKHWDZrmkz, 0 },
{ X86::VPUNPCKLBWZrrkz, X86::VPUNPCKLBWZrmkz, 0 },
{ X86::VPUNPCKLDQZrrkz, X86::VPUNPCKLDQZrmkz, 0 },
{ X86::VPUNPCKLQDQZrrkz, X86::VPUNPCKLQDQZrmkz, 0 },
{ X86::VPUNPCKLWDZrrkz, X86::VPUNPCKLWDZrmkz, 0 },
{ X86::VPXORDZrrkz, X86::VPXORDZrmkz, 0 },
{ X86::VPXORQZrrkz, X86::VPXORQZrmkz, 0 },
{ X86::VSUBPDZrrkz, X86::VSUBPDZrmkz, 0 },
{ X86::VSUBPSZrrkz, X86::VSUBPSZrmkz, 0 },
{ X86::VUNPCKHPDZrrkz, X86::VUNPCKHPDZrmkz, 0 },
{ X86::VUNPCKHPSZrrkz, X86::VUNPCKHPSZrmkz, 0 },
{ X86::VUNPCKLPDZrrkz, X86::VUNPCKLPDZrmkz, 0 },
{ X86::VUNPCKLPSZrrkz, X86::VUNPCKLPSZrmkz, 0 },
{ X86::VXORPDZrrkz, X86::VXORPDZrmkz, 0 },
{ X86::VXORPSZrrkz, X86::VXORPSZrmkz, 0 },
// AVX-512{F,VL} masked arithmetic instructions 256-bit
{ X86::VADDPDZ256rrkz, X86::VADDPDZ256rmkz, 0 },
{ X86::VADDPSZ256rrkz, X86::VADDPSZ256rmkz, 0 },
{ X86::VALIGNDZ256rrikz, X86::VALIGNDZ256rmikz, 0 },
{ X86::VALIGNQZ256rrikz, X86::VALIGNQZ256rmikz, 0 },
{ X86::VANDNPDZ256rrkz, X86::VANDNPDZ256rmkz, 0 },
{ X86::VANDNPSZ256rrkz, X86::VANDNPSZ256rmkz, 0 },
{ X86::VANDPDZ256rrkz, X86::VANDPDZ256rmkz, 0 },
{ X86::VANDPSZ256rrkz, X86::VANDPSZ256rmkz, 0 },
{ X86::VDIVPDZ256rrkz, X86::VDIVPDZ256rmkz, 0 },
{ X86::VDIVPSZ256rrkz, X86::VDIVPSZ256rmkz, 0 },
{ X86::VINSERTF32x4Z256rrkz, X86::VINSERTF32x4Z256rmkz, 0 },
{ X86::VINSERTF64x2Z256rrkz, X86::VINSERTF64x2Z256rmkz, 0 },
{ X86::VINSERTI32x4Z256rrkz, X86::VINSERTI32x4Z256rmkz, 0 },
{ X86::VINSERTI64x2Z256rrkz, X86::VINSERTI64x2Z256rmkz, 0 },
{ X86::VMAXCPDZ256rrkz, X86::VMAXCPDZ256rmkz, 0 },
{ X86::VMAXCPSZ256rrkz, X86::VMAXCPSZ256rmkz, 0 },
{ X86::VMAXPDZ256rrkz, X86::VMAXPDZ256rmkz, 0 },
{ X86::VMAXPSZ256rrkz, X86::VMAXPSZ256rmkz, 0 },
{ X86::VMINCPDZ256rrkz, X86::VMINCPDZ256rmkz, 0 },
{ X86::VMINCPSZ256rrkz, X86::VMINCPSZ256rmkz, 0 },
{ X86::VMINPDZ256rrkz, X86::VMINPDZ256rmkz, 0 },
{ X86::VMINPSZ256rrkz, X86::VMINPSZ256rmkz, 0 },
{ X86::VMULPDZ256rrkz, X86::VMULPDZ256rmkz, 0 },
{ X86::VMULPSZ256rrkz, X86::VMULPSZ256rmkz, 0 },
{ X86::VORPDZ256rrkz, X86::VORPDZ256rmkz, 0 },
{ X86::VORPSZ256rrkz, X86::VORPSZ256rmkz, 0 },
{ X86::VPADDBZ256rrkz, X86::VPADDBZ256rmkz, 0 },
{ X86::VPADDDZ256rrkz, X86::VPADDDZ256rmkz, 0 },
{ X86::VPADDQZ256rrkz, X86::VPADDQZ256rmkz, 0 },
{ X86::VPADDSBZ256rrkz, X86::VPADDSBZ256rmkz, 0 },
{ X86::VPADDSWZ256rrkz, X86::VPADDSWZ256rmkz, 0 },
{ X86::VPADDUSBZ256rrkz, X86::VPADDUSBZ256rmkz, 0 },
{ X86::VPADDUSWZ256rrkz, X86::VPADDUSWZ256rmkz, 0 },
{ X86::VPADDWZ256rrkz, X86::VPADDWZ256rmkz, 0 },
{ X86::VPALIGNRZ256rrikz, X86::VPALIGNRZ256rmikz, 0 },
{ X86::VPANDDZ256rrkz, X86::VPANDDZ256rmkz, 0 },
{ X86::VPANDNDZ256rrkz, X86::VPANDNDZ256rmkz, 0 },
{ X86::VPANDNQZ256rrkz, X86::VPANDNQZ256rmkz, 0 },
{ X86::VPANDQZ256rrkz, X86::VPANDQZ256rmkz, 0 },
{ X86::VPERMBZ256rrkz, X86::VPERMBZ256rmkz, 0 },
{ X86::VPERMDZ256rrkz, X86::VPERMDZ256rmkz, 0 },
{ X86::VPERMILPDZ256rrkz, X86::VPERMILPDZ256rmkz, 0 },
{ X86::VPERMILPSZ256rrkz, X86::VPERMILPSZ256rmkz, 0 },
{ X86::VPERMPDZ256rrkz, X86::VPERMPDZ256rmkz, 0 },
{ X86::VPERMPSZ256rrkz, X86::VPERMPSZ256rmkz, 0 },
{ X86::VPERMQZ256rrkz, X86::VPERMQZ256rmkz, 0 },
{ X86::VPERMWZ256rrkz, X86::VPERMWZ256rmkz, 0 },
{ X86::VPMADDUBSWZ256rrkz, X86::VPMADDUBSWZ256rmkz, 0 },
{ X86::VPMADDWDZ256rrkz, X86::VPMADDWDZ256rmkz, 0 },
{ X86::VPORDZ256rrkz, X86::VPORDZ256rmkz, 0 },
{ X86::VPORQZ256rrkz, X86::VPORQZ256rmkz, 0 },
{ X86::VPSHUFBZ256rrkz, X86::VPSHUFBZ256rmkz, 0 },
{ X86::VPSUBBZ256rrkz, X86::VPSUBBZ256rmkz, 0 },
{ X86::VPSUBDZ256rrkz, X86::VPSUBDZ256rmkz, 0 },
{ X86::VPSUBQZ256rrkz, X86::VPSUBQZ256rmkz, 0 },
{ X86::VPSUBSBZ256rrkz, X86::VPSUBSBZ256rmkz, 0 },
{ X86::VPSUBSWZ256rrkz, X86::VPSUBSWZ256rmkz, 0 },
{ X86::VPSUBUSBZ256rrkz, X86::VPSUBUSBZ256rmkz, 0 },
{ X86::VPSUBUSWZ256rrkz, X86::VPSUBUSWZ256rmkz, 0 },
{ X86::VPSUBWZ256rrkz, X86::VPSUBWZ256rmkz, 0 },
{ X86::VPUNPCKHBWZ256rrkz, X86::VPUNPCKHBWZ256rmkz, 0 },
{ X86::VPUNPCKHDQZ256rrkz, X86::VPUNPCKHDQZ256rmkz, 0 },
{ X86::VPUNPCKHQDQZ256rrkz, X86::VPUNPCKHQDQZ256rmkz, 0 },
{ X86::VPUNPCKHWDZ256rrkz, X86::VPUNPCKHWDZ256rmkz, 0 },
{ X86::VPUNPCKLBWZ256rrkz, X86::VPUNPCKLBWZ256rmkz, 0 },
{ X86::VPUNPCKLDQZ256rrkz, X86::VPUNPCKLDQZ256rmkz, 0 },
{ X86::VPUNPCKLQDQZ256rrkz, X86::VPUNPCKLQDQZ256rmkz, 0 },
{ X86::VPUNPCKLWDZ256rrkz, X86::VPUNPCKLWDZ256rmkz, 0 },
{ X86::VPXORDZ256rrkz, X86::VPXORDZ256rmkz, 0 },
{ X86::VPXORQZ256rrkz, X86::VPXORQZ256rmkz, 0 },
{ X86::VSUBPDZ256rrkz, X86::VSUBPDZ256rmkz, 0 },
{ X86::VSUBPSZ256rrkz, X86::VSUBPSZ256rmkz, 0 },
{ X86::VUNPCKHPDZ256rrkz, X86::VUNPCKHPDZ256rmkz, 0 },
{ X86::VUNPCKHPSZ256rrkz, X86::VUNPCKHPSZ256rmkz, 0 },
{ X86::VUNPCKLPDZ256rrkz, X86::VUNPCKLPDZ256rmkz, 0 },
{ X86::VUNPCKLPSZ256rrkz, X86::VUNPCKLPSZ256rmkz, 0 },
{ X86::VXORPDZ256rrkz, X86::VXORPDZ256rmkz, 0 },
{ X86::VXORPSZ256rrkz, X86::VXORPSZ256rmkz, 0 },
// AVX-512{F,VL} masked arithmetic instructions 128-bit
{ X86::VADDPDZ128rrkz, X86::VADDPDZ128rmkz, 0 },
{ X86::VADDPSZ128rrkz, X86::VADDPSZ128rmkz, 0 },
{ X86::VALIGNDZ128rrikz, X86::VALIGNDZ128rmikz, 0 },
{ X86::VALIGNQZ128rrikz, X86::VALIGNQZ128rmikz, 0 },
{ X86::VANDNPDZ128rrkz, X86::VANDNPDZ128rmkz, 0 },
{ X86::VANDNPSZ128rrkz, X86::VANDNPSZ128rmkz, 0 },
{ X86::VANDPDZ128rrkz, X86::VANDPDZ128rmkz, 0 },
{ X86::VANDPSZ128rrkz, X86::VANDPSZ128rmkz, 0 },
{ X86::VDIVPDZ128rrkz, X86::VDIVPDZ128rmkz, 0 },
{ X86::VDIVPSZ128rrkz, X86::VDIVPSZ128rmkz, 0 },
{ X86::VMAXCPDZ128rrkz, X86::VMAXCPDZ128rmkz, 0 },
{ X86::VMAXCPSZ128rrkz, X86::VMAXCPSZ128rmkz, 0 },
{ X86::VMAXPDZ128rrkz, X86::VMAXPDZ128rmkz, 0 },
{ X86::VMAXPSZ128rrkz, X86::VMAXPSZ128rmkz, 0 },
{ X86::VMINCPDZ128rrkz, X86::VMINCPDZ128rmkz, 0 },
{ X86::VMINCPSZ128rrkz, X86::VMINCPSZ128rmkz, 0 },
{ X86::VMINPDZ128rrkz, X86::VMINPDZ128rmkz, 0 },
{ X86::VMINPSZ128rrkz, X86::VMINPSZ128rmkz, 0 },
{ X86::VMULPDZ128rrkz, X86::VMULPDZ128rmkz, 0 },
{ X86::VMULPSZ128rrkz, X86::VMULPSZ128rmkz, 0 },
{ X86::VORPDZ128rrkz, X86::VORPDZ128rmkz, 0 },
{ X86::VORPSZ128rrkz, X86::VORPSZ128rmkz, 0 },
{ X86::VPADDBZ128rrkz, X86::VPADDBZ128rmkz, 0 },
{ X86::VPADDDZ128rrkz, X86::VPADDDZ128rmkz, 0 },
{ X86::VPADDQZ128rrkz, X86::VPADDQZ128rmkz, 0 },
{ X86::VPADDSBZ128rrkz, X86::VPADDSBZ128rmkz, 0 },
{ X86::VPADDSWZ128rrkz, X86::VPADDSWZ128rmkz, 0 },
{ X86::VPADDUSBZ128rrkz, X86::VPADDUSBZ128rmkz, 0 },
{ X86::VPADDUSWZ128rrkz, X86::VPADDUSWZ128rmkz, 0 },
{ X86::VPADDWZ128rrkz, X86::VPADDWZ128rmkz, 0 },
{ X86::VPALIGNRZ128rrikz, X86::VPALIGNRZ128rmikz, 0 },
{ X86::VPANDDZ128rrkz, X86::VPANDDZ128rmkz, 0 },
{ X86::VPANDNDZ128rrkz, X86::VPANDNDZ128rmkz, 0 },
{ X86::VPANDNQZ128rrkz, X86::VPANDNQZ128rmkz, 0 },
{ X86::VPANDQZ128rrkz, X86::VPANDQZ128rmkz, 0 },
{ X86::VPERMBZ128rrkz, X86::VPERMBZ128rmkz, 0 },
{ X86::VPERMILPDZ128rrkz, X86::VPERMILPDZ128rmkz, 0 },
{ X86::VPERMILPSZ128rrkz, X86::VPERMILPSZ128rmkz, 0 },
{ X86::VPERMWZ128rrkz, X86::VPERMWZ128rmkz, 0 },
{ X86::VPMADDUBSWZ128rrkz, X86::VPMADDUBSWZ128rmkz, 0 },
{ X86::VPMADDWDZ128rrkz, X86::VPMADDWDZ128rmkz, 0 },
{ X86::VPORDZ128rrkz, X86::VPORDZ128rmkz, 0 },
{ X86::VPORQZ128rrkz, X86::VPORQZ128rmkz, 0 },
{ X86::VPSHUFBZ128rrkz, X86::VPSHUFBZ128rmkz, 0 },
{ X86::VPSUBBZ128rrkz, X86::VPSUBBZ128rmkz, 0 },
{ X86::VPSUBDZ128rrkz, X86::VPSUBDZ128rmkz, 0 },
{ X86::VPSUBQZ128rrkz, X86::VPSUBQZ128rmkz, 0 },
{ X86::VPSUBSBZ128rrkz, X86::VPSUBSBZ128rmkz, 0 },
{ X86::VPSUBSWZ128rrkz, X86::VPSUBSWZ128rmkz, 0 },
{ X86::VPSUBUSBZ128rrkz, X86::VPSUBUSBZ128rmkz, 0 },
{ X86::VPSUBUSWZ128rrkz, X86::VPSUBUSWZ128rmkz, 0 },
{ X86::VPSUBWZ128rrkz, X86::VPSUBWZ128rmkz, 0 },
{ X86::VPUNPCKHBWZ128rrkz, X86::VPUNPCKHBWZ128rmkz, 0 },
{ X86::VPUNPCKHDQZ128rrkz, X86::VPUNPCKHDQZ128rmkz, 0 },
{ X86::VPUNPCKHQDQZ128rrkz, X86::VPUNPCKHQDQZ128rmkz, 0 },
{ X86::VPUNPCKHWDZ128rrkz, X86::VPUNPCKHWDZ128rmkz, 0 },
{ X86::VPUNPCKLBWZ128rrkz, X86::VPUNPCKLBWZ128rmkz, 0 },
{ X86::VPUNPCKLDQZ128rrkz, X86::VPUNPCKLDQZ128rmkz, 0 },
{ X86::VPUNPCKLQDQZ128rrkz, X86::VPUNPCKLQDQZ128rmkz, 0 },
{ X86::VPUNPCKLWDZ128rrkz, X86::VPUNPCKLWDZ128rmkz, 0 },
{ X86::VPXORDZ128rrkz, X86::VPXORDZ128rmkz, 0 },
{ X86::VPXORQZ128rrkz, X86::VPXORQZ128rmkz, 0 },
{ X86::VSUBPDZ128rrkz, X86::VSUBPDZ128rmkz, 0 },
{ X86::VSUBPSZ128rrkz, X86::VSUBPSZ128rmkz, 0 },
{ X86::VUNPCKHPDZ128rrkz, X86::VUNPCKHPDZ128rmkz, 0 },
{ X86::VUNPCKHPSZ128rrkz, X86::VUNPCKHPSZ128rmkz, 0 },
{ X86::VUNPCKLPDZ128rrkz, X86::VUNPCKLPDZ128rmkz, 0 },
{ X86::VUNPCKLPSZ128rrkz, X86::VUNPCKLPSZ128rmkz, 0 },
{ X86::VXORPDZ128rrkz, X86::VXORPDZ128rmkz, 0 },
{ X86::VXORPSZ128rrkz, X86::VXORPSZ128rmkz, 0 },
// AVX-512 masked foldable instructions
{ X86::VPERMILPDZrik, X86::VPERMILPDZmik, 0 },
{ X86::VPERMILPSZrik, X86::VPERMILPSZmik, 0 },
{ X86::VPERMPDZrik, X86::VPERMPDZmik, 0 },
{ X86::VPERMQZrik, X86::VPERMQZmik, 0 },
{ X86::VPMOVSXBDZrrk, X86::VPMOVSXBDZrmk, 0 },
{ X86::VPMOVSXBQZrrk, X86::VPMOVSXBQZrmk, TB_NO_REVERSE },
{ X86::VPMOVSXBWZrrk, X86::VPMOVSXBWZrmk, 0 },
{ X86::VPMOVSXDQZrrk, X86::VPMOVSXDQZrmk, 0 },
{ X86::VPMOVSXWDZrrk, X86::VPMOVSXWDZrmk, 0 },
{ X86::VPMOVSXWQZrrk, X86::VPMOVSXWQZrmk, 0 },
{ X86::VPMOVZXBDZrrk, X86::VPMOVZXBDZrmk, 0 },
{ X86::VPMOVZXBQZrrk, X86::VPMOVZXBQZrmk, TB_NO_REVERSE },
{ X86::VPMOVZXBWZrrk, X86::VPMOVZXBWZrmk, 0 },
{ X86::VPMOVZXDQZrrk, X86::VPMOVZXDQZrmk, 0 },
{ X86::VPMOVZXWDZrrk, X86::VPMOVZXWDZrmk, 0 },
{ X86::VPMOVZXWQZrrk, X86::VPMOVZXWQZrmk, 0 },
{ X86::VPSHUFDZrik, X86::VPSHUFDZmik, 0 },
{ X86::VPSHUFHWZrik, X86::VPSHUFHWZmik, 0 },
{ X86::VPSHUFLWZrik, X86::VPSHUFLWZmik, 0 },
// AVX-512VL 256-bit masked foldable instructions
{ X86::VPERMILPDZ256rik, X86::VPERMILPDZ256mik, 0 },
{ X86::VPERMILPSZ256rik, X86::VPERMILPSZ256mik, 0 },
{ X86::VPERMPDZ256rik, X86::VPERMPDZ256mik, 0 },
{ X86::VPERMQZ256rik, X86::VPERMQZ256mik, 0 },
{ X86::VPMOVSXBDZ256rrk, X86::VPMOVSXBDZ256rmk, TB_NO_REVERSE },
{ X86::VPMOVSXBQZ256rrk, X86::VPMOVSXBQZ256rmk, TB_NO_REVERSE },
{ X86::VPMOVSXBWZ256rrk, X86::VPMOVSXBWZ256rmk, 0 },
{ X86::VPMOVSXDQZ256rrk, X86::VPMOVSXDQZ256rmk, 0 },
{ X86::VPMOVSXWDZ256rrk, X86::VPMOVSXWDZ256rmk, 0 },
{ X86::VPMOVSXWQZ256rrk, X86::VPMOVSXWQZ256rmk, TB_NO_REVERSE },
{ X86::VPMOVZXBDZ256rrk, X86::VPMOVZXBDZ256rmk, TB_NO_REVERSE },
{ X86::VPMOVZXBQZ256rrk, X86::VPMOVZXBQZ256rmk, TB_NO_REVERSE },
{ X86::VPMOVZXBWZ256rrk, X86::VPMOVZXBWZ256rmk, 0 },
{ X86::VPMOVZXDQZ256rrk, X86::VPMOVZXDQZ256rmk, 0 },
{ X86::VPMOVZXWDZ256rrk, X86::VPMOVZXWDZ256rmk, 0 },
{ X86::VPMOVZXWQZ256rrk, X86::VPMOVZXWQZ256rmk, TB_NO_REVERSE },
{ X86::VPSHUFDZ256rik, X86::VPSHUFDZ256mik, 0 },
{ X86::VPSHUFHWZ256rik, X86::VPSHUFHWZ256mik, 0 },
{ X86::VPSHUFLWZ256rik, X86::VPSHUFLWZ256mik, 0 },
// AVX-512VL 128-bit masked foldable instructions
{ X86::VPERMILPDZ128rik, X86::VPERMILPDZ128mik, 0 },
{ X86::VPERMILPSZ128rik, X86::VPERMILPSZ128mik, 0 },
{ X86::VPMOVSXBDZ128rrk, X86::VPMOVSXBDZ128rmk, TB_NO_REVERSE },
{ X86::VPMOVSXBQZ128rrk, X86::VPMOVSXBQZ128rmk, TB_NO_REVERSE },
{ X86::VPMOVSXBWZ128rrk, X86::VPMOVSXBWZ128rmk, TB_NO_REVERSE },
{ X86::VPMOVSXDQZ128rrk, X86::VPMOVSXDQZ128rmk, TB_NO_REVERSE },
{ X86::VPMOVSXWDZ128rrk, X86::VPMOVSXWDZ128rmk, TB_NO_REVERSE },
{ X86::VPMOVSXWQZ128rrk, X86::VPMOVSXWQZ128rmk, TB_NO_REVERSE },
{ X86::VPMOVZXBDZ128rrk, X86::VPMOVZXBDZ128rmk, TB_NO_REVERSE },
{ X86::VPMOVZXBQZ128rrk, X86::VPMOVZXBQZ128rmk, TB_NO_REVERSE },
{ X86::VPMOVZXBWZ128rrk, X86::VPMOVZXBWZ128rmk, TB_NO_REVERSE },
{ X86::VPMOVZXDQZ128rrk, X86::VPMOVZXDQZ128rmk, TB_NO_REVERSE },
{ X86::VPMOVZXWDZ128rrk, X86::VPMOVZXWDZ128rmk, TB_NO_REVERSE },
{ X86::VPMOVZXWQZ128rrk, X86::VPMOVZXWQZ128rmk, TB_NO_REVERSE },
{ X86::VPSHUFDZ128rik, X86::VPSHUFDZ128mik, 0 },
{ X86::VPSHUFHWZ128rik, X86::VPSHUFHWZ128mik, 0 },
{ X86::VPSHUFLWZ128rik, X86::VPSHUFLWZ128mik, 0 },
};
for (X86MemoryFoldTableEntry Entry : MemoryFoldTable3) {
AddTableEntry(RegOp2MemOpTable3, MemOp2RegOpTable,
Entry.RegOp, Entry.MemOp,
// Index 3, folded load
Entry.Flags | TB_INDEX_3 | TB_FOLDED_LOAD);
}
auto I = X86InstrFMA3Info::rm_begin();
auto E = X86InstrFMA3Info::rm_end();
for (; I != E; ++I) {
if (!I.getGroup()->isKMasked()) {
// Intrinsic forms need to pass TB_NO_REVERSE.
if (I.getGroup()->isIntrinsic()) {
AddTableEntry(RegOp2MemOpTable3, MemOp2RegOpTable,
I.getRegOpcode(), I.getMemOpcode(),
TB_ALIGN_NONE | TB_INDEX_3 | TB_FOLDED_LOAD | TB_NO_REVERSE);
} else {
AddTableEntry(RegOp2MemOpTable3, MemOp2RegOpTable,
I.getRegOpcode(), I.getMemOpcode(),
TB_ALIGN_NONE | TB_INDEX_3 | TB_FOLDED_LOAD);
}
}
}
static const X86MemoryFoldTableEntry MemoryFoldTable4[] = {
// AVX-512 foldable masked instructions
{ X86::VADDPDZrrk, X86::VADDPDZrmk, 0 },
{ X86::VADDPSZrrk, X86::VADDPSZrmk, 0 },
{ X86::VALIGNDZrrik, X86::VALIGNDZrmik, 0 },
{ X86::VALIGNQZrrik, X86::VALIGNQZrmik, 0 },
{ X86::VANDNPDZrrk, X86::VANDNPDZrmk, 0 },
{ X86::VANDNPSZrrk, X86::VANDNPSZrmk, 0 },
{ X86::VANDPDZrrk, X86::VANDPDZrmk, 0 },
{ X86::VANDPSZrrk, X86::VANDPSZrmk, 0 },
{ X86::VDIVPDZrrk, X86::VDIVPDZrmk, 0 },
{ X86::VDIVPSZrrk, X86::VDIVPSZrmk, 0 },
{ X86::VINSERTF32x4Zrrk, X86::VINSERTF32x4Zrmk, 0 },
{ X86::VINSERTF32x8Zrrk, X86::VINSERTF32x8Zrmk, 0 },
{ X86::VINSERTF64x2Zrrk, X86::VINSERTF64x2Zrmk, 0 },
{ X86::VINSERTF64x4Zrrk, X86::VINSERTF64x4Zrmk, 0 },
{ X86::VINSERTI32x4Zrrk, X86::VINSERTI32x4Zrmk, 0 },
{ X86::VINSERTI32x8Zrrk, X86::VINSERTI32x8Zrmk, 0 },
{ X86::VINSERTI64x2Zrrk, X86::VINSERTI64x2Zrmk, 0 },
{ X86::VINSERTI64x4Zrrk, X86::VINSERTI64x4Zrmk, 0 },
{ X86::VMAXCPDZrrk, X86::VMAXCPDZrmk, 0 },
{ X86::VMAXCPSZrrk, X86::VMAXCPSZrmk, 0 },
{ X86::VMAXPDZrrk, X86::VMAXPDZrmk, 0 },
{ X86::VMAXPSZrrk, X86::VMAXPSZrmk, 0 },
{ X86::VMINCPDZrrk, X86::VMINCPDZrmk, 0 },
{ X86::VMINCPSZrrk, X86::VMINCPSZrmk, 0 },
{ X86::VMINPDZrrk, X86::VMINPDZrmk, 0 },
{ X86::VMINPSZrrk, X86::VMINPSZrmk, 0 },
{ X86::VMULPDZrrk, X86::VMULPDZrmk, 0 },
{ X86::VMULPSZrrk, X86::VMULPSZrmk, 0 },
{ X86::VORPDZrrk, X86::VORPDZrmk, 0 },
{ X86::VORPSZrrk, X86::VORPSZrmk, 0 },
{ X86::VPADDBZrrk, X86::VPADDBZrmk, 0 },
{ X86::VPADDDZrrk, X86::VPADDDZrmk, 0 },
{ X86::VPADDQZrrk, X86::VPADDQZrmk, 0 },
{ X86::VPADDSBZrrk, X86::VPADDSBZrmk, 0 },
{ X86::VPADDSWZrrk, X86::VPADDSWZrmk, 0 },
{ X86::VPADDUSBZrrk, X86::VPADDUSBZrmk, 0 },
{ X86::VPADDUSWZrrk, X86::VPADDUSWZrmk, 0 },
{ X86::VPADDWZrrk, X86::VPADDWZrmk, 0 },
{ X86::VPALIGNRZrrik, X86::VPALIGNRZrmik, 0 },
{ X86::VPANDDZrrk, X86::VPANDDZrmk, 0 },
{ X86::VPANDNDZrrk, X86::VPANDNDZrmk, 0 },
{ X86::VPANDNQZrrk, X86::VPANDNQZrmk, 0 },
{ X86::VPANDQZrrk, X86::VPANDQZrmk, 0 },
{ X86::VPERMBZrrk, X86::VPERMBZrmk, 0 },
{ X86::VPERMDZrrk, X86::VPERMDZrmk, 0 },
{ X86::VPERMI2Brrk, X86::VPERMI2Brmk, 0 },
{ X86::VPERMI2Drrk, X86::VPERMI2Drmk, 0 },
{ X86::VPERMI2PSrrk, X86::VPERMI2PSrmk, 0 },
{ X86::VPERMI2PDrrk, X86::VPERMI2PDrmk, 0 },
{ X86::VPERMI2Qrrk, X86::VPERMI2Qrmk, 0 },
{ X86::VPERMI2Wrrk, X86::VPERMI2Wrmk, 0 },
{ X86::VPERMILPDZrrk, X86::VPERMILPDZrmk, 0 },
{ X86::VPERMILPSZrrk, X86::VPERMILPSZrmk, 0 },
{ X86::VPERMPDZrrk, X86::VPERMPDZrmk, 0 },
{ X86::VPERMPSZrrk, X86::VPERMPSZrmk, 0 },
{ X86::VPERMQZrrk, X86::VPERMQZrmk, 0 },
{ X86::VPERMT2Brrk, X86::VPERMT2Brmk, 0 },
{ X86::VPERMT2Drrk, X86::VPERMT2Drmk, 0 },
{ X86::VPERMT2PSrrk, X86::VPERMT2PSrmk, 0 },
{ X86::VPERMT2PDrrk, X86::VPERMT2PDrmk, 0 },
{ X86::VPERMT2Qrrk, X86::VPERMT2Qrmk, 0 },
{ X86::VPERMT2Wrrk, X86::VPERMT2Wrmk, 0 },
{ X86::VPERMWZrrk, X86::VPERMWZrmk, 0 },
{ X86::VPMADDUBSWZrrk, X86::VPMADDUBSWZrmk, 0 },
{ X86::VPMADDWDZrrk, X86::VPMADDWDZrmk, 0 },
{ X86::VPORDZrrk, X86::VPORDZrmk, 0 },
{ X86::VPORQZrrk, X86::VPORQZrmk, 0 },
{ X86::VPSHUFBZrrk, X86::VPSHUFBZrmk, 0 },
{ X86::VPSUBBZrrk, X86::VPSUBBZrmk, 0 },
{ X86::VPSUBDZrrk, X86::VPSUBDZrmk, 0 },
{ X86::VPSUBQZrrk, X86::VPSUBQZrmk, 0 },
{ X86::VPSUBSBZrrk, X86::VPSUBSBZrmk, 0 },
{ X86::VPSUBSWZrrk, X86::VPSUBSWZrmk, 0 },
{ X86::VPSUBUSBZrrk, X86::VPSUBUSBZrmk, 0 },
{ X86::VPSUBUSWZrrk, X86::VPSUBUSWZrmk, 0 },
{ X86::VPTERNLOGDZrrik, X86::VPTERNLOGDZrmik, 0 },
{ X86::VPTERNLOGQZrrik, X86::VPTERNLOGQZrmik, 0 },
{ X86::VPUNPCKHBWZrrk, X86::VPUNPCKHBWZrmk, 0 },
{ X86::VPUNPCKHDQZrrk, X86::VPUNPCKHDQZrmk, 0 },
{ X86::VPUNPCKHQDQZrrk, X86::VPUNPCKHQDQZrmk, 0 },
{ X86::VPUNPCKHWDZrrk, X86::VPUNPCKHWDZrmk, 0 },
{ X86::VPUNPCKLBWZrrk, X86::VPUNPCKLBWZrmk, 0 },
{ X86::VPUNPCKLDQZrrk, X86::VPUNPCKLDQZrmk, 0 },
{ X86::VPUNPCKLQDQZrrk, X86::VPUNPCKLQDQZrmk, 0 },
{ X86::VPUNPCKLWDZrrk, X86::VPUNPCKLWDZrmk, 0 },
{ X86::VPXORDZrrk, X86::VPXORDZrmk, 0 },
{ X86::VPXORQZrrk, X86::VPXORQZrmk, 0 },
{ X86::VSUBPDZrrk, X86::VSUBPDZrmk, 0 },
{ X86::VSUBPSZrrk, X86::VSUBPSZrmk, 0 },
{ X86::VUNPCKHPDZrrk, X86::VUNPCKHPDZrmk, 0 },
{ X86::VUNPCKHPSZrrk, X86::VUNPCKHPSZrmk, 0 },
{ X86::VUNPCKLPDZrrk, X86::VUNPCKLPDZrmk, 0 },
{ X86::VUNPCKLPSZrrk, X86::VUNPCKLPSZrmk, 0 },
{ X86::VXORPDZrrk, X86::VXORPDZrmk, 0 },
{ X86::VXORPSZrrk, X86::VXORPSZrmk, 0 },
// AVX-512{F,VL} foldable masked instructions 256-bit
{ X86::VADDPDZ256rrk, X86::VADDPDZ256rmk, 0 },
{ X86::VADDPSZ256rrk, X86::VADDPSZ256rmk, 0 },
{ X86::VALIGNDZ256rrik, X86::VALIGNDZ256rmik, 0 },
{ X86::VALIGNQZ256rrik, X86::VALIGNQZ256rmik, 0 },
{ X86::VANDNPDZ256rrk, X86::VANDNPDZ256rmk, 0 },
{ X86::VANDNPSZ256rrk, X86::VANDNPSZ256rmk, 0 },
{ X86::VANDPDZ256rrk, X86::VANDPDZ256rmk, 0 },
{ X86::VANDPSZ256rrk, X86::VANDPSZ256rmk, 0 },
{ X86::VDIVPDZ256rrk, X86::VDIVPDZ256rmk, 0 },
{ X86::VDIVPSZ256rrk, X86::VDIVPSZ256rmk, 0 },
{ X86::VINSERTF32x4Z256rrk,X86::VINSERTF32x4Z256rmk, 0 },
{ X86::VINSERTF64x2Z256rrk,X86::VINSERTF64x2Z256rmk, 0 },
{ X86::VINSERTI32x4Z256rrk,X86::VINSERTI32x4Z256rmk, 0 },
{ X86::VINSERTI64x2Z256rrk,X86::VINSERTI64x2Z256rmk, 0 },
{ X86::VMAXCPDZ256rrk, X86::VMAXCPDZ256rmk, 0 },
{ X86::VMAXCPSZ256rrk, X86::VMAXCPSZ256rmk, 0 },
{ X86::VMAXPDZ256rrk, X86::VMAXPDZ256rmk, 0 },
{ X86::VMAXPSZ256rrk, X86::VMAXPSZ256rmk, 0 },
{ X86::VMINCPDZ256rrk, X86::VMINCPDZ256rmk, 0 },
{ X86::VMINCPSZ256rrk, X86::VMINCPSZ256rmk, 0 },
{ X86::VMINPDZ256rrk, X86::VMINPDZ256rmk, 0 },
{ X86::VMINPSZ256rrk, X86::VMINPSZ256rmk, 0 },
{ X86::VMULPDZ256rrk, X86::VMULPDZ256rmk, 0 },
{ X86::VMULPSZ256rrk, X86::VMULPSZ256rmk, 0 },
{ X86::VORPDZ256rrk, X86::VORPDZ256rmk, 0 },
{ X86::VORPSZ256rrk, X86::VORPSZ256rmk, 0 },
{ X86::VPADDBZ256rrk, X86::VPADDBZ256rmk, 0 },
{ X86::VPADDDZ256rrk, X86::VPADDDZ256rmk, 0 },
{ X86::VPADDQZ256rrk, X86::VPADDQZ256rmk, 0 },
{ X86::VPADDSBZ256rrk, X86::VPADDSBZ256rmk, 0 },
{ X86::VPADDSWZ256rrk, X86::VPADDSWZ256rmk, 0 },
{ X86::VPADDUSBZ256rrk, X86::VPADDUSBZ256rmk, 0 },
{ X86::VPADDUSWZ256rrk, X86::VPADDUSWZ256rmk, 0 },
{ X86::VPADDWZ256rrk, X86::VPADDWZ256rmk, 0 },
{ X86::VPALIGNRZ256rrik, X86::VPALIGNRZ256rmik, 0 },
{ X86::VPANDDZ256rrk, X86::VPANDDZ256rmk, 0 },
{ X86::VPANDNDZ256rrk, X86::VPANDNDZ256rmk, 0 },
{ X86::VPANDNQZ256rrk, X86::VPANDNQZ256rmk, 0 },
{ X86::VPANDQZ256rrk, X86::VPANDQZ256rmk, 0 },
{ X86::VPERMBZ256rrk, X86::VPERMBZ256rmk, 0 },
{ X86::VPERMDZ256rrk, X86::VPERMDZ256rmk, 0 },
{ X86::VPERMI2B256rrk, X86::VPERMI2B256rmk, 0 },
{ X86::VPERMI2D256rrk, X86::VPERMI2D256rmk, 0 },
{ X86::VPERMI2PD256rrk, X86::VPERMI2PD256rmk, 0 },
{ X86::VPERMI2PS256rrk, X86::VPERMI2PS256rmk, 0 },
{ X86::VPERMI2Q256rrk, X86::VPERMI2Q256rmk, 0 },
{ X86::VPERMI2W256rrk, X86::VPERMI2W256rmk, 0 },
{ X86::VPERMILPDZ256rrk, X86::VPERMILPDZ256rmk, 0 },
{ X86::VPERMILPSZ256rrk, X86::VPERMILPSZ256rmk, 0 },
{ X86::VPERMPDZ256rrk, X86::VPERMPDZ256rmk, 0 },
{ X86::VPERMPSZ256rrk, X86::VPERMPSZ256rmk, 0 },
{ X86::VPERMQZ256rrk, X86::VPERMQZ256rmk, 0 },
{ X86::VPERMT2B256rrk, X86::VPERMT2B256rmk, 0 },
{ X86::VPERMT2D256rrk, X86::VPERMT2D256rmk, 0 },
{ X86::VPERMT2PD256rrk, X86::VPERMT2PD256rmk, 0 },
{ X86::VPERMT2PS256rrk, X86::VPERMT2PS256rmk, 0 },
{ X86::VPERMT2Q256rrk, X86::VPERMT2Q256rmk, 0 },
{ X86::VPERMT2W256rrk, X86::VPERMT2W256rmk, 0 },
{ X86::VPERMWZ256rrk, X86::VPERMWZ256rmk, 0 },
{ X86::VPMADDUBSWZ256rrk, X86::VPMADDUBSWZ256rmk, 0 },
{ X86::VPMADDWDZ256rrk, X86::VPMADDWDZ256rmk, 0 },
{ X86::VPORDZ256rrk, X86::VPORDZ256rmk, 0 },
{ X86::VPORQZ256rrk, X86::VPORQZ256rmk, 0 },
{ X86::VPSHUFBZ256rrk, X86::VPSHUFBZ256rmk, 0 },
{ X86::VPSUBBZ256rrk, X86::VPSUBBZ256rmk, 0 },
{ X86::VPSUBDZ256rrk, X86::VPSUBDZ256rmk, 0 },
{ X86::VPSUBQZ256rrk, X86::VPSUBQZ256rmk, 0 },
{ X86::VPSUBSBZ256rrk, X86::VPSUBSBZ256rmk, 0 },
{ X86::VPSUBSWZ256rrk, X86::VPSUBSWZ256rmk, 0 },
{ X86::VPSUBUSBZ256rrk, X86::VPSUBUSBZ256rmk, 0 },
{ X86::VPSUBUSWZ256rrk, X86::VPSUBUSWZ256rmk, 0 },
{ X86::VPSUBWZ256rrk, X86::VPSUBWZ256rmk, 0 },
{ X86::VPTERNLOGDZ256rrik, X86::VPTERNLOGDZ256rmik, 0 },
{ X86::VPTERNLOGQZ256rrik, X86::VPTERNLOGQZ256rmik, 0 },
{ X86::VPUNPCKHBWZ256rrk, X86::VPUNPCKHBWZ256rmk, 0 },
{ X86::VPUNPCKHDQZ256rrk, X86::VPUNPCKHDQZ256rmk, 0 },
{ X86::VPUNPCKHQDQZ256rrk, X86::VPUNPCKHQDQZ256rmk, 0 },
{ X86::VPUNPCKHWDZ256rrk, X86::VPUNPCKHWDZ256rmk, 0 },
{ X86::VPUNPCKLBWZ256rrk, X86::VPUNPCKLBWZ256rmk, 0 },
{ X86::VPUNPCKLDQZ256rrk, X86::VPUNPCKLDQZ256rmk, 0 },
{ X86::VPUNPCKLQDQZ256rrk, X86::VPUNPCKLQDQZ256rmk, 0 },
{ X86::VPUNPCKLWDZ256rrk, X86::VPUNPCKLWDZ256rmk, 0 },
{ X86::VPXORDZ256rrk, X86::VPXORDZ256rmk, 0 },
{ X86::VPXORQZ256rrk, X86::VPXORQZ256rmk, 0 },
{ X86::VSUBPDZ256rrk, X86::VSUBPDZ256rmk, 0 },
{ X86::VSUBPSZ256rrk, X86::VSUBPSZ256rmk, 0 },
{ X86::VUNPCKHPDZ256rrk, X86::VUNPCKHPDZ256rmk, 0 },
{ X86::VUNPCKHPSZ256rrk, X86::VUNPCKHPSZ256rmk, 0 },
{ X86::VUNPCKLPDZ256rrk, X86::VUNPCKLPDZ256rmk, 0 },
{ X86::VUNPCKLPSZ256rrk, X86::VUNPCKLPSZ256rmk, 0 },
{ X86::VXORPDZ256rrk, X86::VXORPDZ256rmk, 0 },
{ X86::VXORPSZ256rrk, X86::VXORPSZ256rmk, 0 },
// AVX-512{F,VL} foldable instructions 128-bit
{ X86::VADDPDZ128rrk, X86::VADDPDZ128rmk, 0 },
{ X86::VADDPSZ128rrk, X86::VADDPSZ128rmk, 0 },
{ X86::VALIGNDZ128rrik, X86::VALIGNDZ128rmik, 0 },
{ X86::VALIGNQZ128rrik, X86::VALIGNQZ128rmik, 0 },
{ X86::VANDNPDZ128rrk, X86::VANDNPDZ128rmk, 0 },
{ X86::VANDNPSZ128rrk, X86::VANDNPSZ128rmk, 0 },
{ X86::VANDPDZ128rrk, X86::VANDPDZ128rmk, 0 },
{ X86::VANDPSZ128rrk, X86::VANDPSZ128rmk, 0 },
{ X86::VDIVPDZ128rrk, X86::VDIVPDZ128rmk, 0 },
{ X86::VDIVPSZ128rrk, X86::VDIVPSZ128rmk, 0 },
{ X86::VMAXCPDZ128rrk, X86::VMAXCPDZ128rmk, 0 },
{ X86::VMAXCPSZ128rrk, X86::VMAXCPSZ128rmk, 0 },
{ X86::VMAXPDZ128rrk, X86::VMAXPDZ128rmk, 0 },
{ X86::VMAXPSZ128rrk, X86::VMAXPSZ128rmk, 0 },
{ X86::VMINCPDZ128rrk, X86::VMINCPDZ128rmk, 0 },
{ X86::VMINCPSZ128rrk, X86::VMINCPSZ128rmk, 0 },
{ X86::VMINPDZ128rrk, X86::VMINPDZ128rmk, 0 },
{ X86::VMINPSZ128rrk, X86::VMINPSZ128rmk, 0 },
{ X86::VMULPDZ128rrk, X86::VMULPDZ128rmk, 0 },
{ X86::VMULPSZ128rrk, X86::VMULPSZ128rmk, 0 },
{ X86::VORPDZ128rrk, X86::VORPDZ128rmk, 0 },
{ X86::VORPSZ128rrk, X86::VORPSZ128rmk, 0 },
{ X86::VPADDBZ128rrk, X86::VPADDBZ128rmk, 0 },
{ X86::VPADDDZ128rrk, X86::VPADDDZ128rmk, 0 },
{ X86::VPADDQZ128rrk, X86::VPADDQZ128rmk, 0 },
{ X86::VPADDSBZ128rrk, X86::VPADDSBZ128rmk, 0 },
{ X86::VPADDSWZ128rrk, X86::VPADDSWZ128rmk, 0 },
{ X86::VPADDUSBZ128rrk, X86::VPADDUSBZ128rmk, 0 },
{ X86::VPADDUSWZ128rrk, X86::VPADDUSWZ128rmk, 0 },
{ X86::VPADDWZ128rrk, X86::VPADDWZ128rmk, 0 },
{ X86::VPALIGNRZ128rrik, X86::VPALIGNRZ128rmik, 0 },
{ X86::VPANDDZ128rrk, X86::VPANDDZ128rmk, 0 },
{ X86::VPANDNDZ128rrk, X86::VPANDNDZ128rmk, 0 },
{ X86::VPANDNQZ128rrk, X86::VPANDNQZ128rmk, 0 },
{ X86::VPANDQZ128rrk, X86::VPANDQZ128rmk, 0 },
{ X86::VPERMBZ128rrk, X86::VPERMBZ128rmk, 0 },
{ X86::VPERMI2B128rrk, X86::VPERMI2B128rmk, 0 },
{ X86::VPERMI2D128rrk, X86::VPERMI2D128rmk, 0 },
{ X86::VPERMI2PD128rrk, X86::VPERMI2PD128rmk, 0 },
{ X86::VPERMI2PS128rrk, X86::VPERMI2PS128rmk, 0 },
{ X86::VPERMI2Q128rrk, X86::VPERMI2Q128rmk, 0 },
{ X86::VPERMI2W128rrk, X86::VPERMI2W128rmk, 0 },
{ X86::VPERMILPDZ128rrk, X86::VPERMILPDZ128rmk, 0 },
{ X86::VPERMILPSZ128rrk, X86::VPERMILPSZ128rmk, 0 },
{ X86::VPERMT2B128rrk, X86::VPERMT2B128rmk, 0 },
{ X86::VPERMT2D128rrk, X86::VPERMT2D128rmk, 0 },
{ X86::VPERMT2PD128rrk, X86::VPERMT2PD128rmk, 0 },
{ X86::VPERMT2PS128rrk, X86::VPERMT2PS128rmk, 0 },
{ X86::VPERMT2Q128rrk, X86::VPERMT2Q128rmk, 0 },
{ X86::VPERMT2W128rrk, X86::VPERMT2W128rmk, 0 },
{ X86::VPERMWZ128rrk, X86::VPERMWZ128rmk, 0 },
{ X86::VPMADDUBSWZ128rrk, X86::VPMADDUBSWZ128rmk, 0 },
{ X86::VPMADDWDZ128rrk, X86::VPMADDWDZ128rmk, 0 },
{ X86::VPORDZ128rrk, X86::VPORDZ128rmk, 0 },
{ X86::VPORQZ128rrk, X86::VPORQZ128rmk, 0 },
{ X86::VPSHUFBZ128rrk, X86::VPSHUFBZ128rmk, 0 },
{ X86::VPSUBBZ128rrk, X86::VPSUBBZ128rmk, 0 },
{ X86::VPSUBDZ128rrk, X86::VPSUBDZ128rmk, 0 },
{ X86::VPSUBQZ128rrk, X86::VPSUBQZ128rmk, 0 },
{ X86::VPSUBSBZ128rrk, X86::VPSUBSBZ128rmk, 0 },
{ X86::VPSUBSWZ128rrk, X86::VPSUBSWZ128rmk, 0 },
{ X86::VPSUBUSBZ128rrk, X86::VPSUBUSBZ128rmk, 0 },
{ X86::VPSUBUSWZ128rrk, X86::VPSUBUSWZ128rmk, 0 },
{ X86::VPSUBWZ128rrk, X86::VPSUBWZ128rmk, 0 },
{ X86::VPTERNLOGDZ128rrik, X86::VPTERNLOGDZ128rmik, 0 },
{ X86::VPTERNLOGQZ128rrik, X86::VPTERNLOGQZ128rmik, 0 },
{ X86::VPUNPCKHBWZ128rrk, X86::VPUNPCKHBWZ128rmk, 0 },
{ X86::VPUNPCKHDQZ128rrk, X86::VPUNPCKHDQZ128rmk, 0 },
{ X86::VPUNPCKHQDQZ128rrk, X86::VPUNPCKHQDQZ128rmk, 0 },
{ X86::VPUNPCKHWDZ128rrk, X86::VPUNPCKHWDZ128rmk, 0 },
{ X86::VPUNPCKLBWZ128rrk, X86::VPUNPCKLBWZ128rmk, 0 },
{ X86::VPUNPCKLDQZ128rrk, X86::VPUNPCKLDQZ128rmk, 0 },
{ X86::VPUNPCKLQDQZ128rrk, X86::VPUNPCKLQDQZ128rmk, 0 },
{ X86::VPUNPCKLWDZ128rrk, X86::VPUNPCKLWDZ128rmk, 0 },
{ X86::VPXORDZ128rrk, X86::VPXORDZ128rmk, 0 },
{ X86::VPXORQZ128rrk, X86::VPXORQZ128rmk, 0 },
{ X86::VSUBPDZ128rrk, X86::VSUBPDZ128rmk, 0 },
{ X86::VSUBPSZ128rrk, X86::VSUBPSZ128rmk, 0 },
{ X86::VUNPCKHPDZ128rrk, X86::VUNPCKHPDZ128rmk, 0 },
{ X86::VUNPCKHPSZ128rrk, X86::VUNPCKHPSZ128rmk, 0 },
{ X86::VUNPCKLPDZ128rrk, X86::VUNPCKLPDZ128rmk, 0 },
{ X86::VUNPCKLPSZ128rrk, X86::VUNPCKLPSZ128rmk, 0 },
{ X86::VXORPDZ128rrk, X86::VXORPDZ128rmk, 0 },
{ X86::VXORPSZ128rrk, X86::VXORPSZ128rmk, 0 },
// 512-bit three source instructions with zero masking.
{ X86::VPERMI2Brrkz, X86::VPERMI2Brmkz, 0 },
{ X86::VPERMI2Drrkz, X86::VPERMI2Drmkz, 0 },
{ X86::VPERMI2PSrrkz, X86::VPERMI2PSrmkz, 0 },
{ X86::VPERMI2PDrrkz, X86::VPERMI2PDrmkz, 0 },
{ X86::VPERMI2Qrrkz, X86::VPERMI2Qrmkz, 0 },
{ X86::VPERMI2Wrrkz, X86::VPERMI2Wrmkz, 0 },
{ X86::VPERMT2Brrkz, X86::VPERMT2Brmkz, 0 },
{ X86::VPERMT2Drrkz, X86::VPERMT2Drmkz, 0 },
{ X86::VPERMT2PSrrkz, X86::VPERMT2PSrmkz, 0 },
{ X86::VPERMT2PDrrkz, X86::VPERMT2PDrmkz, 0 },
{ X86::VPERMT2Qrrkz, X86::VPERMT2Qrmkz, 0 },
{ X86::VPERMT2Wrrkz, X86::VPERMT2Wrmkz, 0 },
{ X86::VPTERNLOGDZrrikz, X86::VPTERNLOGDZrmikz, 0 },
{ X86::VPTERNLOGQZrrikz, X86::VPTERNLOGQZrmikz, 0 },
// 256-bit three source instructions with zero masking.
{ X86::VPERMI2B256rrkz, X86::VPERMI2B256rmkz, 0 },
{ X86::VPERMI2D256rrkz, X86::VPERMI2D256rmkz, 0 },
{ X86::VPERMI2PD256rrkz, X86::VPERMI2PD256rmkz, 0 },
{ X86::VPERMI2PS256rrkz, X86::VPERMI2PS256rmkz, 0 },
{ X86::VPERMI2Q256rrkz, X86::VPERMI2Q256rmkz, 0 },
{ X86::VPERMI2W256rrkz, X86::VPERMI2W256rmkz, 0 },
{ X86::VPERMT2B256rrkz, X86::VPERMT2B256rmkz, 0 },
{ X86::VPERMT2D256rrkz, X86::VPERMT2D256rmkz, 0 },
{ X86::VPERMT2PD256rrkz, X86::VPERMT2PD256rmkz, 0 },
{ X86::VPERMT2PS256rrkz, X86::VPERMT2PS256rmkz, 0 },
{ X86::VPERMT2Q256rrkz, X86::VPERMT2Q256rmkz, 0 },
{ X86::VPERMT2W256rrkz, X86::VPERMT2W256rmkz, 0 },
{ X86::VPTERNLOGDZ256rrikz,X86::VPTERNLOGDZ256rmikz, 0 },
{ X86::VPTERNLOGQZ256rrikz,X86::VPTERNLOGQZ256rmikz, 0 },
// 128-bit three source instructions with zero masking.
{ X86::VPERMI2B128rrkz, X86::VPERMI2B128rmkz, 0 },
{ X86::VPERMI2D128rrkz, X86::VPERMI2D128rmkz, 0 },
{ X86::VPERMI2PD128rrkz, X86::VPERMI2PD128rmkz, 0 },
{ X86::VPERMI2PS128rrkz, X86::VPERMI2PS128rmkz, 0 },
{ X86::VPERMI2Q128rrkz, X86::VPERMI2Q128rmkz, 0 },
{ X86::VPERMI2W128rrkz, X86::VPERMI2W128rmkz, 0 },
{ X86::VPERMT2B128rrkz, X86::VPERMT2B128rmkz, 0 },
{ X86::VPERMT2D128rrkz, X86::VPERMT2D128rmkz, 0 },
{ X86::VPERMT2PD128rrkz, X86::VPERMT2PD128rmkz, 0 },
{ X86::VPERMT2PS128rrkz, X86::VPERMT2PS128rmkz, 0 },
{ X86::VPERMT2Q128rrkz, X86::VPERMT2Q128rmkz, 0 },
{ X86::VPERMT2W128rrkz, X86::VPERMT2W128rmkz, 0 },
{ X86::VPTERNLOGDZ128rrikz,X86::VPTERNLOGDZ128rmikz, 0 },
{ X86::VPTERNLOGQZ128rrikz,X86::VPTERNLOGQZ128rmikz, 0 },
};
for (X86MemoryFoldTableEntry Entry : MemoryFoldTable4) {
AddTableEntry(RegOp2MemOpTable4, MemOp2RegOpTable,
Entry.RegOp, Entry.MemOp,
// Index 4, folded load
Entry.Flags | TB_INDEX_4 | TB_FOLDED_LOAD);
}
for (I = X86InstrFMA3Info::rm_begin(); I != E; ++I) {
if (I.getGroup()->isKMasked()) {
// Intrinsics need to pass TB_NO_REVERSE.
if (I.getGroup()->isIntrinsic()) {
AddTableEntry(RegOp2MemOpTable4, MemOp2RegOpTable,
I.getRegOpcode(), I.getMemOpcode(),
TB_ALIGN_NONE | TB_INDEX_4 | TB_FOLDED_LOAD | TB_NO_REVERSE);
} else {
AddTableEntry(RegOp2MemOpTable4, MemOp2RegOpTable,
I.getRegOpcode(), I.getMemOpcode(),
TB_ALIGN_NONE | TB_INDEX_4 | TB_FOLDED_LOAD);
}
}
}
}
void
X86InstrInfo::AddTableEntry(RegOp2MemOpTableType &R2MTable,
MemOp2RegOpTableType &M2RTable,
uint16_t RegOp, uint16_t MemOp, uint16_t Flags) {
if ((Flags & TB_NO_FORWARD) == 0) {
assert(!R2MTable.count(RegOp) && "Duplicate entry!");
R2MTable[RegOp] = std::make_pair(MemOp, Flags);
}
if ((Flags & TB_NO_REVERSE) == 0) {
assert(!M2RTable.count(MemOp) &&
"Duplicated entries in unfolding maps?");
M2RTable[MemOp] = std::make_pair(RegOp, Flags);
}
}
bool
X86InstrInfo::isCoalescableExtInstr(const MachineInstr &MI,
unsigned &SrcReg, unsigned &DstReg,
unsigned &SubIdx) const {
switch (MI.getOpcode()) {
default: break;
case X86::MOVSX16rr8:
case X86::MOVZX16rr8:
case X86::MOVSX32rr8:
case X86::MOVZX32rr8:
case X86::MOVSX64rr8:
if (!Subtarget.is64Bit())
// It's not always legal to reference the low 8-bit of the larger
// register in 32-bit mode.
return false;
case X86::MOVSX32rr16:
case X86::MOVZX32rr16:
case X86::MOVSX64rr16:
case X86::MOVSX64rr32: {
if (MI.getOperand(0).getSubReg() || MI.getOperand(1).getSubReg())
// Be conservative.
return false;
SrcReg = MI.getOperand(1).getReg();
DstReg = MI.getOperand(0).getReg();
switch (MI.getOpcode()) {
default: llvm_unreachable("Unreachable!");
case X86::MOVSX16rr8:
case X86::MOVZX16rr8:
case X86::MOVSX32rr8:
case X86::MOVZX32rr8:
case X86::MOVSX64rr8:
SubIdx = X86::sub_8bit;
break;
case X86::MOVSX32rr16:
case X86::MOVZX32rr16:
case X86::MOVSX64rr16:
SubIdx = X86::sub_16bit;
break;
case X86::MOVSX64rr32:
SubIdx = X86::sub_32bit;
break;
}
return true;
}
}
return false;
}
int X86InstrInfo::getSPAdjust(const MachineInstr &MI) const {
const MachineFunction *MF = MI.getParent()->getParent();
const TargetFrameLowering *TFI = MF->getSubtarget().getFrameLowering();
if (MI.getOpcode() == getCallFrameSetupOpcode() ||
MI.getOpcode() == getCallFrameDestroyOpcode()) {
unsigned StackAlign = TFI->getStackAlignment();
int SPAdj =
(MI.getOperand(0).getImm() + StackAlign - 1) / StackAlign * StackAlign;
SPAdj -= MI.getOperand(1).getImm();
if (MI.getOpcode() == getCallFrameSetupOpcode())
return SPAdj;
else
return -SPAdj;
}
// To know whether a call adjusts the stack, we need information
// that is bound to the following ADJCALLSTACKUP pseudo.
// Look for the next ADJCALLSTACKUP that follows the call.
if (MI.isCall()) {
const MachineBasicBlock *MBB = MI.getParent();
auto I = ++MachineBasicBlock::const_iterator(MI);
for (auto E = MBB->end(); I != E; ++I) {
if (I->getOpcode() == getCallFrameDestroyOpcode() ||
I->isCall())
break;
}
// If we could not find a frame destroy opcode, then it has already
// been simplified, so we don't care.
if (I->getOpcode() != getCallFrameDestroyOpcode())
return 0;
return -(I->getOperand(1).getImm());
}
// Currently handle only PUSHes we can reasonably expect to see
// in call sequences
switch (MI.getOpcode()) {
default:
return 0;
case X86::PUSH32i8:
case X86::PUSH32r:
case X86::PUSH32rmm:
case X86::PUSH32rmr:
case X86::PUSHi32:
return 4;
case X86::PUSH64i8:
case X86::PUSH64r:
case X86::PUSH64rmm:
case X86::PUSH64rmr:
case X86::PUSH64i32:
return 8;
}
}
/// Return true and the FrameIndex if the specified
/// operand and follow operands form a reference to the stack frame.
bool X86InstrInfo::isFrameOperand(const MachineInstr &MI, unsigned int Op,
int &FrameIndex) const {
if (MI.getOperand(Op + X86::AddrBaseReg).isFI() &&
MI.getOperand(Op + X86::AddrScaleAmt).isImm() &&
MI.getOperand(Op + X86::AddrIndexReg).isReg() &&
MI.getOperand(Op + X86::AddrDisp).isImm() &&
MI.getOperand(Op + X86::AddrScaleAmt).getImm() == 1 &&
MI.getOperand(Op + X86::AddrIndexReg).getReg() == 0 &&
MI.getOperand(Op + X86::AddrDisp).getImm() == 0) {
FrameIndex = MI.getOperand(Op + X86::AddrBaseReg).getIndex();
return true;
}
return false;
}
static bool isFrameLoadOpcode(int Opcode) {
switch (Opcode) {
default:
return false;
case X86::MOV8rm:
case X86::MOV16rm:
case X86::MOV32rm:
case X86::MOV64rm:
case X86::LD_Fp64m:
case X86::MOVSSrm:
case X86::MOVSDrm:
case X86::MOVAPSrm:
case X86::MOVUPSrm:
case X86::MOVAPDrm:
case X86::MOVUPDrm:
case X86::MOVDQArm:
case X86::MOVDQUrm:
case X86::VMOVSSrm:
case X86::VMOVSDrm:
case X86::VMOVAPSrm:
case X86::VMOVUPSrm:
case X86::VMOVAPDrm:
case X86::VMOVUPDrm:
case X86::VMOVDQArm:
case X86::VMOVDQUrm:
case X86::VMOVUPSYrm:
case X86::VMOVAPSYrm:
case X86::VMOVUPDYrm:
case X86::VMOVAPDYrm:
case X86::VMOVDQUYrm:
case X86::VMOVDQAYrm:
case X86::MMX_MOVD64rm:
case X86::MMX_MOVQ64rm:
case X86::VMOVSSZrm:
case X86::VMOVSDZrm:
case X86::VMOVAPSZrm:
case X86::VMOVAPSZ128rm:
case X86::VMOVAPSZ256rm:
case X86::VMOVAPSZ128rm_NOVLX:
case X86::VMOVAPSZ256rm_NOVLX:
case X86::VMOVUPSZrm:
case X86::VMOVUPSZ128rm:
case X86::VMOVUPSZ256rm:
case X86::VMOVUPSZ128rm_NOVLX:
case X86::VMOVUPSZ256rm_NOVLX:
case X86::VMOVAPDZrm:
case X86::VMOVAPDZ128rm:
case X86::VMOVAPDZ256rm:
case X86::VMOVUPDZrm:
case X86::VMOVUPDZ128rm:
case X86::VMOVUPDZ256rm:
case X86::VMOVDQA32Zrm:
case X86::VMOVDQA32Z128rm:
case X86::VMOVDQA32Z256rm:
case X86::VMOVDQU32Zrm:
case X86::VMOVDQU32Z128rm:
case X86::VMOVDQU32Z256rm:
case X86::VMOVDQA64Zrm:
case X86::VMOVDQA64Z128rm:
case X86::VMOVDQA64Z256rm:
case X86::VMOVDQU64Zrm:
case X86::VMOVDQU64Z128rm:
case X86::VMOVDQU64Z256rm:
case X86::VMOVDQU8Zrm:
case X86::VMOVDQU8Z128rm:
case X86::VMOVDQU8Z256rm:
case X86::VMOVDQU16Zrm:
case X86::VMOVDQU16Z128rm:
case X86::VMOVDQU16Z256rm:
case X86::KMOVBkm:
case X86::KMOVWkm:
case X86::KMOVDkm:
case X86::KMOVQkm:
return true;
}
}
static bool isFrameStoreOpcode(int Opcode) {
switch (Opcode) {
default: break;
case X86::MOV8mr:
case X86::MOV16mr:
case X86::MOV32mr:
case X86::MOV64mr:
case X86::ST_FpP64m:
case X86::MOVSSmr:
case X86::MOVSDmr:
case X86::MOVAPSmr:
case X86::MOVUPSmr:
case X86::MOVAPDmr:
case X86::MOVUPDmr:
case X86::MOVDQAmr:
case X86::MOVDQUmr:
case X86::VMOVSSmr:
case X86::VMOVSDmr:
case X86::VMOVAPSmr:
case X86::VMOVUPSmr:
case X86::VMOVAPDmr:
case X86::VMOVUPDmr:
case X86::VMOVDQAmr:
case X86::VMOVDQUmr:
case X86::VMOVUPSYmr:
case X86::VMOVAPSYmr:
case X86::VMOVUPDYmr:
case X86::VMOVAPDYmr:
case X86::VMOVDQUYmr:
case X86::VMOVDQAYmr:
case X86::VMOVSSZmr:
case X86::VMOVSDZmr:
case X86::VMOVUPSZmr:
case X86::VMOVUPSZ128mr:
case X86::VMOVUPSZ256mr:
case X86::VMOVUPSZ128mr_NOVLX:
case X86::VMOVUPSZ256mr_NOVLX:
case X86::VMOVAPSZmr:
case X86::VMOVAPSZ128mr:
case X86::VMOVAPSZ256mr:
case X86::VMOVAPSZ128mr_NOVLX:
case X86::VMOVAPSZ256mr_NOVLX:
case X86::VMOVUPDZmr:
case X86::VMOVUPDZ128mr:
case X86::VMOVUPDZ256mr:
case X86::VMOVAPDZmr:
case X86::VMOVAPDZ128mr:
case X86::VMOVAPDZ256mr:
case X86::VMOVDQA32Zmr:
case X86::VMOVDQA32Z128mr:
case X86::VMOVDQA32Z256mr:
case X86::VMOVDQU32Zmr:
case X86::VMOVDQU32Z128mr:
case X86::VMOVDQU32Z256mr:
case X86::VMOVDQA64Zmr:
case X86::VMOVDQA64Z128mr:
case X86::VMOVDQA64Z256mr:
case X86::VMOVDQU64Zmr:
case X86::VMOVDQU64Z128mr:
case X86::VMOVDQU64Z256mr:
case X86::VMOVDQU8Zmr:
case X86::VMOVDQU8Z128mr:
case X86::VMOVDQU8Z256mr:
case X86::VMOVDQU16Zmr:
case X86::VMOVDQU16Z128mr:
case X86::VMOVDQU16Z256mr:
case X86::MMX_MOVD64mr:
case X86::MMX_MOVQ64mr:
case X86::MMX_MOVNTQmr:
case X86::KMOVBmk:
case X86::KMOVWmk:
case X86::KMOVDmk:
case X86::KMOVQmk:
return true;
}
return false;
}
unsigned X86InstrInfo::isLoadFromStackSlot(const MachineInstr &MI,
int &FrameIndex) const {
if (isFrameLoadOpcode(MI.getOpcode()))
if (MI.getOperand(0).getSubReg() == 0 && isFrameOperand(MI, 1, FrameIndex))
return MI.getOperand(0).getReg();
return 0;
}
unsigned X86InstrInfo::isLoadFromStackSlotPostFE(const MachineInstr &MI,
int &FrameIndex) const {
if (isFrameLoadOpcode(MI.getOpcode())) {
unsigned Reg;
if ((Reg = isLoadFromStackSlot(MI, FrameIndex)))
return Reg;
// Check for post-frame index elimination operations
const MachineMemOperand *Dummy;
return hasLoadFromStackSlot(MI, Dummy, FrameIndex);
}
return 0;
}
unsigned X86InstrInfo::isStoreToStackSlot(const MachineInstr &MI,
int &FrameIndex) const {
if (isFrameStoreOpcode(MI.getOpcode()))
if (MI.getOperand(X86::AddrNumOperands).getSubReg() == 0 &&
isFrameOperand(MI, 0, FrameIndex))
return MI.getOperand(X86::AddrNumOperands).getReg();
return 0;
}
unsigned X86InstrInfo::isStoreToStackSlotPostFE(const MachineInstr &MI,
int &FrameIndex) const {
if (isFrameStoreOpcode(MI.getOpcode())) {
unsigned Reg;
if ((Reg = isStoreToStackSlot(MI, FrameIndex)))
return Reg;
// Check for post-frame index elimination operations
const MachineMemOperand *Dummy;
return hasStoreToStackSlot(MI, Dummy, FrameIndex);
}
return 0;
}
/// Return true if register is PIC base; i.e.g defined by X86::MOVPC32r.
static bool regIsPICBase(unsigned BaseReg, const MachineRegisterInfo &MRI) {
// Don't waste compile time scanning use-def chains of physregs.
if (!TargetRegisterInfo::isVirtualRegister(BaseReg))
return false;
bool isPICBase = false;
for (MachineRegisterInfo::def_instr_iterator I = MRI.def_instr_begin(BaseReg),
E = MRI.def_instr_end(); I != E; ++I) {
MachineInstr *DefMI = &*I;
if (DefMI->getOpcode() != X86::MOVPC32r)
return false;
assert(!isPICBase && "More than one PIC base?");
isPICBase = true;
}
return isPICBase;
}
bool X86InstrInfo::isReallyTriviallyReMaterializable(const MachineInstr &MI,
AliasAnalysis *AA) const {
switch (MI.getOpcode()) {
default: break;
case X86::MOV8rm:
case X86::MOV8rm_NOREX:
case X86::MOV16rm:
case X86::MOV32rm:
case X86::MOV64rm:
case X86::LD_Fp64m:
case X86::MOVSSrm:
case X86::MOVSDrm:
case X86::MOVAPSrm:
case X86::MOVUPSrm:
case X86::MOVAPDrm:
case X86::MOVUPDrm:
case X86::MOVDQArm:
case X86::MOVDQUrm:
case X86::VMOVSSrm:
case X86::VMOVSDrm:
case X86::VMOVAPSrm:
case X86::VMOVUPSrm:
case X86::VMOVAPDrm:
case X86::VMOVUPDrm:
case X86::VMOVDQArm:
case X86::VMOVDQUrm:
case X86::VMOVAPSYrm:
case X86::VMOVUPSYrm:
case X86::VMOVAPDYrm:
case X86::VMOVUPDYrm:
case X86::VMOVDQAYrm:
case X86::VMOVDQUYrm:
case X86::MMX_MOVD64rm:
case X86::MMX_MOVQ64rm:
// AVX-512
case X86::VMOVSSZrm:
case X86::VMOVSDZrm:
case X86::VMOVAPDZ128rm:
case X86::VMOVAPDZ256rm:
case X86::VMOVAPDZrm:
case X86::VMOVAPSZ128rm:
case X86::VMOVAPSZ256rm:
case X86::VMOVAPSZ128rm_NOVLX:
case X86::VMOVAPSZ256rm_NOVLX:
case X86::VMOVAPSZrm:
case X86::VMOVDQA32Z128rm:
case X86::VMOVDQA32Z256rm:
case X86::VMOVDQA32Zrm:
case X86::VMOVDQA64Z128rm:
case X86::VMOVDQA64Z256rm:
case X86::VMOVDQA64Zrm:
case X86::VMOVDQU16Z128rm:
case X86::VMOVDQU16Z256rm:
case X86::VMOVDQU16Zrm:
case X86::VMOVDQU32Z128rm:
case X86::VMOVDQU32Z256rm:
case X86::VMOVDQU32Zrm:
case X86::VMOVDQU64Z128rm:
case X86::VMOVDQU64Z256rm:
case X86::VMOVDQU64Zrm:
case X86::VMOVDQU8Z128rm:
case X86::VMOVDQU8Z256rm:
case X86::VMOVDQU8Zrm:
case X86::VMOVUPDZ128rm:
case X86::VMOVUPDZ256rm:
case X86::VMOVUPDZrm:
case X86::VMOVUPSZ128rm:
case X86::VMOVUPSZ256rm:
case X86::VMOVUPSZ128rm_NOVLX:
case X86::VMOVUPSZ256rm_NOVLX:
case X86::VMOVUPSZrm: {
// Loads from constant pools are trivially rematerializable.
if (MI.getOperand(1 + X86::AddrBaseReg).isReg() &&
MI.getOperand(1 + X86::AddrScaleAmt).isImm() &&
MI.getOperand(1 + X86::AddrIndexReg).isReg() &&
MI.getOperand(1 + X86::AddrIndexReg).getReg() == 0 &&
MI.isDereferenceableInvariantLoad(AA)) {
unsigned BaseReg = MI.getOperand(1 + X86::AddrBaseReg).getReg();
if (BaseReg == 0 || BaseReg == X86::RIP)
return true;
// Allow re-materialization of PIC load.
if (!ReMatPICStubLoad && MI.getOperand(1 + X86::AddrDisp).isGlobal())
return false;
const MachineFunction &MF = *MI.getParent()->getParent();
const MachineRegisterInfo &MRI = MF.getRegInfo();
return regIsPICBase(BaseReg, MRI);
}
return false;
}
case X86::LEA32r:
case X86::LEA64r: {
if (MI.getOperand(1 + X86::AddrScaleAmt).isImm() &&
MI.getOperand(1 + X86::AddrIndexReg).isReg() &&
MI.getOperand(1 + X86::AddrIndexReg).getReg() == 0 &&
!MI.getOperand(1 + X86::AddrDisp).isReg()) {
// lea fi#, lea GV, etc. are all rematerializable.
if (!MI.getOperand(1 + X86::AddrBaseReg).isReg())
return true;
unsigned BaseReg = MI.getOperand(1 + X86::AddrBaseReg).getReg();
if (BaseReg == 0)
return true;
// Allow re-materialization of lea PICBase + x.
const MachineFunction &MF = *MI.getParent()->getParent();
const MachineRegisterInfo &MRI = MF.getRegInfo();
return regIsPICBase(BaseReg, MRI);
}
return false;
}
}
// All other instructions marked M_REMATERIALIZABLE are always trivially
// rematerializable.
return true;
}
bool X86InstrInfo::isSafeToClobberEFLAGS(MachineBasicBlock &MBB,
MachineBasicBlock::iterator I) const {
MachineBasicBlock::iterator E = MBB.end();
// For compile time consideration, if we are not able to determine the
// safety after visiting 4 instructions in each direction, we will assume
// it's not safe.
MachineBasicBlock::iterator Iter = I;
for (unsigned i = 0; Iter != E && i < 4; ++i) {
bool SeenDef = false;
for (unsigned j = 0, e = Iter->getNumOperands(); j != e; ++j) {
MachineOperand &MO = Iter->getOperand(j);
if (MO.isRegMask() && MO.clobbersPhysReg(X86::EFLAGS))
SeenDef = true;
if (!MO.isReg())
continue;
if (MO.getReg() == X86::EFLAGS) {
if (MO.isUse())
return false;
SeenDef = true;
}
}
if (SeenDef)
// This instruction defines EFLAGS, no need to look any further.
return true;
++Iter;
// Skip over DBG_VALUE.
while (Iter != E && Iter->isDebugValue())
++Iter;
}
// It is safe to clobber EFLAGS at the end of a block of no successor has it
// live in.
if (Iter == E) {
for (MachineBasicBlock *S : MBB.successors())
if (S->isLiveIn(X86::EFLAGS))
return false;
return true;
}
MachineBasicBlock::iterator B = MBB.begin();
Iter = I;
for (unsigned i = 0; i < 4; ++i) {
// If we make it to the beginning of the block, it's safe to clobber
// EFLAGS iff EFLAGS is not live-in.
if (Iter == B)
return !MBB.isLiveIn(X86::EFLAGS);
--Iter;
// Skip over DBG_VALUE.
while (Iter != B && Iter->isDebugValue())
--Iter;
bool SawKill = false;
for (unsigned j = 0, e = Iter->getNumOperands(); j != e; ++j) {
MachineOperand &MO = Iter->getOperand(j);
// A register mask may clobber EFLAGS, but we should still look for a
// live EFLAGS def.
if (MO.isRegMask() && MO.clobbersPhysReg(X86::EFLAGS))
SawKill = true;
if (MO.isReg() && MO.getReg() == X86::EFLAGS) {
if (MO.isDef()) return MO.isDead();
if (MO.isKill()) SawKill = true;
}
}
if (SawKill)
// This instruction kills EFLAGS and doesn't redefine it, so
// there's no need to look further.
return true;
}
// Conservative answer.
return false;
}
void X86InstrInfo::reMaterialize(MachineBasicBlock &MBB,
MachineBasicBlock::iterator I,
unsigned DestReg, unsigned SubIdx,
const MachineInstr &Orig,
const TargetRegisterInfo &TRI) const {
bool ClobbersEFLAGS = false;
for (const MachineOperand &MO : Orig.operands()) {
if (MO.isReg() && MO.isDef() && MO.getReg() == X86::EFLAGS) {
ClobbersEFLAGS = true;
break;
}
}
if (ClobbersEFLAGS && !isSafeToClobberEFLAGS(MBB, I)) {
// The instruction clobbers EFLAGS. Re-materialize as MOV32ri to avoid side
// effects.
int Value;
switch (Orig.getOpcode()) {
case X86::MOV32r0: Value = 0; break;
case X86::MOV32r1: Value = 1; break;
case X86::MOV32r_1: Value = -1; break;
default:
llvm_unreachable("Unexpected instruction!");
}
const DebugLoc &DL = Orig.getDebugLoc();
BuildMI(MBB, I, DL, get(X86::MOV32ri))
.addOperand(Orig.getOperand(0))
.addImm(Value);
} else {
MachineInstr *MI = MBB.getParent()->CloneMachineInstr(&Orig);
MBB.insert(I, MI);
}
MachineInstr &NewMI = *std::prev(I);
NewMI.substituteRegister(Orig.getOperand(0).getReg(), DestReg, SubIdx, TRI);
}
/// True if MI has a condition code def, e.g. EFLAGS, that is not marked dead.
bool X86InstrInfo::hasLiveCondCodeDef(MachineInstr &MI) const {
for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
MachineOperand &MO = MI.getOperand(i);
if (MO.isReg() && MO.isDef() &&
MO.getReg() == X86::EFLAGS && !MO.isDead()) {
return true;
}
}
return false;
}
/// Check whether the shift count for a machine operand is non-zero.
inline static unsigned getTruncatedShiftCount(MachineInstr &MI,
unsigned ShiftAmtOperandIdx) {
// The shift count is six bits with the REX.W prefix and five bits without.
unsigned ShiftCountMask = (MI.getDesc().TSFlags & X86II::REX_W) ? 63 : 31;
unsigned Imm = MI.getOperand(ShiftAmtOperandIdx).getImm();
return Imm & ShiftCountMask;
}
/// Check whether the given shift count is appropriate
/// can be represented by a LEA instruction.
inline static bool isTruncatedShiftCountForLEA(unsigned ShAmt) {
// Left shift instructions can be transformed into load-effective-address
// instructions if we can encode them appropriately.
// A LEA instruction utilizes a SIB byte to encode its scale factor.
// The SIB.scale field is two bits wide which means that we can encode any
// shift amount less than 4.
return ShAmt < 4 && ShAmt > 0;
}
bool X86InstrInfo::classifyLEAReg(MachineInstr &MI, const MachineOperand &Src,
unsigned Opc, bool AllowSP, unsigned &NewSrc,
bool &isKill, bool &isUndef,
MachineOperand &ImplicitOp,
LiveVariables *LV) const {
MachineFunction &MF = *MI.getParent()->getParent();
const TargetRegisterClass *RC;
if (AllowSP) {
RC = Opc != X86::LEA32r ? &X86::GR64RegClass : &X86::GR32RegClass;
} else {
RC = Opc != X86::LEA32r ?
&X86::GR64_NOSPRegClass : &X86::GR32_NOSPRegClass;
}
unsigned SrcReg = Src.getReg();
// For both LEA64 and LEA32 the register already has essentially the right
// type (32-bit or 64-bit) we may just need to forbid SP.
if (Opc != X86::LEA64_32r) {
NewSrc = SrcReg;
isKill = Src.isKill();
isUndef = Src.isUndef();
if (TargetRegisterInfo::isVirtualRegister(NewSrc) &&
!MF.getRegInfo().constrainRegClass(NewSrc, RC))
return false;
return true;
}
// This is for an LEA64_32r and incoming registers are 32-bit. One way or
// another we need to add 64-bit registers to the final MI.
if (TargetRegisterInfo::isPhysicalRegister(SrcReg)) {
ImplicitOp = Src;
ImplicitOp.setImplicit();
NewSrc = getX86SubSuperRegister(Src.getReg(), 64);
isKill = Src.isKill();
isUndef = Src.isUndef();
} else {
// Virtual register of the wrong class, we have to create a temporary 64-bit
// vreg to feed into the LEA.
NewSrc = MF.getRegInfo().createVirtualRegister(RC);
MachineInstr *Copy = BuildMI(*MI.getParent(), MI, MI.getDebugLoc(),
get(TargetOpcode::COPY))
.addReg(NewSrc, RegState::Define | RegState::Undef, X86::sub_32bit)
.addOperand(Src);
// Which is obviously going to be dead after we're done with it.
isKill = true;
isUndef = false;
if (LV)
LV->replaceKillInstruction(SrcReg, MI, *Copy);
}
// We've set all the parameters without issue.
return true;
}
/// Helper for convertToThreeAddress when 16-bit LEA is disabled, use 32-bit
/// LEA to form 3-address code by promoting to a 32-bit superregister and then
/// truncating back down to a 16-bit subregister.
MachineInstr *X86InstrInfo::convertToThreeAddressWithLEA(
unsigned MIOpc, MachineFunction::iterator &MFI, MachineInstr &MI,
LiveVariables *LV) const {
MachineBasicBlock::iterator MBBI = MI.getIterator();
unsigned Dest = MI.getOperand(0).getReg();
unsigned Src = MI.getOperand(1).getReg();
bool isDead = MI.getOperand(0).isDead();
bool isKill = MI.getOperand(1).isKill();
MachineRegisterInfo &RegInfo = MFI->getParent()->getRegInfo();
unsigned leaOutReg = RegInfo.createVirtualRegister(&X86::GR32RegClass);
unsigned Opc, leaInReg;
if (Subtarget.is64Bit()) {
Opc = X86::LEA64_32r;
leaInReg = RegInfo.createVirtualRegister(&X86::GR64_NOSPRegClass);
} else {
Opc = X86::LEA32r;
leaInReg = RegInfo.createVirtualRegister(&X86::GR32_NOSPRegClass);
}
// Build and insert into an implicit UNDEF value. This is OK because
// well be shifting and then extracting the lower 16-bits.
// This has the potential to cause partial register stall. e.g.
// movw (%rbp,%rcx,2), %dx
// leal -65(%rdx), %esi
// But testing has shown this *does* help performance in 64-bit mode (at
// least on modern x86 machines).
BuildMI(*MFI, MBBI, MI.getDebugLoc(), get(X86::IMPLICIT_DEF), leaInReg);
MachineInstr *InsMI =
BuildMI(*MFI, MBBI, MI.getDebugLoc(), get(TargetOpcode::COPY))
.addReg(leaInReg, RegState::Define, X86::sub_16bit)
.addReg(Src, getKillRegState(isKill));
MachineInstrBuilder MIB =
BuildMI(*MFI, MBBI, MI.getDebugLoc(), get(Opc), leaOutReg);
switch (MIOpc) {
default: llvm_unreachable("Unreachable!");
case X86::SHL16ri: {
unsigned ShAmt = MI.getOperand(2).getImm();
MIB.addReg(0).addImm(1ULL << ShAmt)
.addReg(leaInReg, RegState::Kill).addImm(0).addReg(0);
break;
}
case X86::INC16r:
addRegOffset(MIB, leaInReg, true, 1);
break;
case X86::DEC16r:
addRegOffset(MIB, leaInReg, true, -1);
break;
case X86::ADD16ri:
case X86::ADD16ri8:
case X86::ADD16ri_DB:
case X86::ADD16ri8_DB:
addRegOffset(MIB, leaInReg, true, MI.getOperand(2).getImm());
break;
case X86::ADD16rr:
case X86::ADD16rr_DB: {
unsigned Src2 = MI.getOperand(2).getReg();
bool isKill2 = MI.getOperand(2).isKill();
unsigned leaInReg2 = 0;
MachineInstr *InsMI2 = nullptr;
if (Src == Src2) {
// ADD16rr %reg1028<kill>, %reg1028
// just a single insert_subreg.
addRegReg(MIB, leaInReg, true, leaInReg, false);
} else {
if (Subtarget.is64Bit())
leaInReg2 = RegInfo.createVirtualRegister(&X86::GR64_NOSPRegClass);
else
leaInReg2 = RegInfo.createVirtualRegister(&X86::GR32_NOSPRegClass);
// Build and insert into an implicit UNDEF value. This is OK because
// well be shifting and then extracting the lower 16-bits.
BuildMI(*MFI, &*MIB, MI.getDebugLoc(), get(X86::IMPLICIT_DEF), leaInReg2);
InsMI2 = BuildMI(*MFI, &*MIB, MI.getDebugLoc(), get(TargetOpcode::COPY))
.addReg(leaInReg2, RegState::Define, X86::sub_16bit)
.addReg(Src2, getKillRegState(isKill2));
addRegReg(MIB, leaInReg, true, leaInReg2, true);
}
if (LV && isKill2 && InsMI2)
LV->replaceKillInstruction(Src2, MI, *InsMI2);
break;
}
}
MachineInstr *NewMI = MIB;
MachineInstr *ExtMI =
BuildMI(*MFI, MBBI, MI.getDebugLoc(), get(TargetOpcode::COPY))
.addReg(Dest, RegState::Define | getDeadRegState(isDead))
.addReg(leaOutReg, RegState::Kill, X86::sub_16bit);
if (LV) {
// Update live variables
LV->getVarInfo(leaInReg).Kills.push_back(NewMI);
LV->getVarInfo(leaOutReg).Kills.push_back(ExtMI);
if (isKill)
LV->replaceKillInstruction(Src, MI, *InsMI);
if (isDead)
LV->replaceKillInstruction(Dest, MI, *ExtMI);
}
return ExtMI;
}
/// This method must be implemented by targets that
/// set the M_CONVERTIBLE_TO_3_ADDR flag. When this flag is set, the target
/// may be able to convert a two-address instruction into a true
/// three-address instruction on demand. This allows the X86 target (for
/// example) to convert ADD and SHL instructions into LEA instructions if they
/// would require register copies due to two-addressness.
///
/// This method returns a null pointer if the transformation cannot be
/// performed, otherwise it returns the new instruction.
///
MachineInstr *
X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
MachineInstr &MI, LiveVariables *LV) const {
// The following opcodes also sets the condition code register(s). Only
// convert them to equivalent lea if the condition code register def's
// are dead!
if (hasLiveCondCodeDef(MI))
return nullptr;
MachineFunction &MF = *MI.getParent()->getParent();
// All instructions input are two-addr instructions. Get the known operands.
const MachineOperand &Dest = MI.getOperand(0);
const MachineOperand &Src = MI.getOperand(1);
MachineInstr *NewMI = nullptr;
// FIXME: 16-bit LEA's are really slow on Athlons, but not bad on P4's. When
// we have better subtarget support, enable the 16-bit LEA generation here.
// 16-bit LEA is also slow on Core2.
bool DisableLEA16 = true;
bool is64Bit = Subtarget.is64Bit();
unsigned MIOpc = MI.getOpcode();
switch (MIOpc) {
default: return nullptr;
case X86::SHL64ri: {
assert(MI.getNumOperands() >= 3 && "Unknown shift instruction!");
unsigned ShAmt = getTruncatedShiftCount(MI, 2);
if (!isTruncatedShiftCountForLEA(ShAmt)) return nullptr;
// LEA can't handle RSP.
if (TargetRegisterInfo::isVirtualRegister(Src.getReg()) &&
!MF.getRegInfo().constrainRegClass(Src.getReg(),
&X86::GR64_NOSPRegClass))
return nullptr;
NewMI = BuildMI(MF, MI.getDebugLoc(), get(X86::LEA64r))
.addOperand(Dest)
.addReg(0)
.addImm(1ULL << ShAmt)
.addOperand(Src)
.addImm(0)
.addReg(0);
break;
}
case X86::SHL32ri: {
assert(MI.getNumOperands() >= 3 && "Unknown shift instruction!");
unsigned ShAmt = getTruncatedShiftCount(MI, 2);
if (!isTruncatedShiftCountForLEA(ShAmt)) return nullptr;
unsigned Opc = is64Bit ? X86::LEA64_32r : X86::LEA32r;
// LEA can't handle ESP.
bool isKill, isUndef;
unsigned SrcReg;
MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ false,
SrcReg, isKill, isUndef, ImplicitOp, LV))
return nullptr;
MachineInstrBuilder MIB =
BuildMI(MF, MI.getDebugLoc(), get(Opc))
.addOperand(Dest)
.addReg(0)
.addImm(1ULL << ShAmt)
.addReg(SrcReg, getKillRegState(isKill) | getUndefRegState(isUndef))
.addImm(0)
.addReg(0);
if (ImplicitOp.getReg() != 0)
MIB.addOperand(ImplicitOp);
NewMI = MIB;
break;
}
case X86::SHL16ri: {
assert(MI.getNumOperands() >= 3 && "Unknown shift instruction!");
unsigned ShAmt = getTruncatedShiftCount(MI, 2);
if (!isTruncatedShiftCountForLEA(ShAmt)) return nullptr;
if (DisableLEA16)
return is64Bit ? convertToThreeAddressWithLEA(MIOpc, MFI, MI, LV)
: nullptr;
NewMI = BuildMI(MF, MI.getDebugLoc(), get(X86::LEA16r))
.addOperand(Dest)
.addReg(0)
.addImm(1ULL << ShAmt)
.addOperand(Src)
.addImm(0)
.addReg(0);
break;
}
case X86::INC64r:
case X86::INC32r: {
assert(MI.getNumOperands() >= 2 && "Unknown inc instruction!");
unsigned Opc = MIOpc == X86::INC64r ? X86::LEA64r
: (is64Bit ? X86::LEA64_32r : X86::LEA32r);
bool isKill, isUndef;
unsigned SrcReg;
MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ false,
SrcReg, isKill, isUndef, ImplicitOp, LV))
return nullptr;
MachineInstrBuilder MIB =
BuildMI(MF, MI.getDebugLoc(), get(Opc))
.addOperand(Dest)
.addReg(SrcReg,
getKillRegState(isKill) | getUndefRegState(isUndef));
if (ImplicitOp.getReg() != 0)
MIB.addOperand(ImplicitOp);
NewMI = addOffset(MIB, 1);
break;
}
case X86::INC16r:
if (DisableLEA16)
return is64Bit ? convertToThreeAddressWithLEA(MIOpc, MFI, MI, LV)
: nullptr;
assert(MI.getNumOperands() >= 2 && "Unknown inc instruction!");
NewMI = addOffset(BuildMI(MF, MI.getDebugLoc(), get(X86::LEA16r))
.addOperand(Dest)
.addOperand(Src),
1);
break;
case X86::DEC64r:
case X86::DEC32r: {
assert(MI.getNumOperands() >= 2 && "Unknown dec instruction!");
unsigned Opc = MIOpc == X86::DEC64r ? X86::LEA64r
: (is64Bit ? X86::LEA64_32r : X86::LEA32r);
bool isKill, isUndef;
unsigned SrcReg;
MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ false,
SrcReg, isKill, isUndef, ImplicitOp, LV))
return nullptr;
MachineInstrBuilder MIB = BuildMI(MF, MI.getDebugLoc(), get(Opc))
.addOperand(Dest)
.addReg(SrcReg, getUndefRegState(isUndef) |
getKillRegState(isKill));
if (ImplicitOp.getReg() != 0)
MIB.addOperand(ImplicitOp);
NewMI = addOffset(MIB, -1);
break;
}
case X86::DEC16r:
if (DisableLEA16)
return is64Bit ? convertToThreeAddressWithLEA(MIOpc, MFI, MI, LV)
: nullptr;
assert(MI.getNumOperands() >= 2 && "Unknown dec instruction!");
NewMI = addOffset(BuildMI(MF, MI.getDebugLoc(), get(X86::LEA16r))
.addOperand(Dest)
.addOperand(Src),
-1);
break;
case X86::ADD64rr:
case X86::ADD64rr_DB:
case X86::ADD32rr:
case X86::ADD32rr_DB: {
assert(MI.getNumOperands() >= 3 && "Unknown add instruction!");
unsigned Opc;
if (MIOpc == X86::ADD64rr || MIOpc == X86::ADD64rr_DB)
Opc = X86::LEA64r;
else
Opc = is64Bit ? X86::LEA64_32r : X86::LEA32r;
bool isKill, isUndef;
unsigned SrcReg;
MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ true,
SrcReg, isKill, isUndef, ImplicitOp, LV))
return nullptr;
const MachineOperand &Src2 = MI.getOperand(2);
bool isKill2, isUndef2;
unsigned SrcReg2;
MachineOperand ImplicitOp2 = MachineOperand::CreateReg(0, false);
if (!classifyLEAReg(MI, Src2, Opc, /*AllowSP=*/ false,
SrcReg2, isKill2, isUndef2, ImplicitOp2, LV))
return nullptr;
MachineInstrBuilder MIB =
BuildMI(MF, MI.getDebugLoc(), get(Opc)).addOperand(Dest);
if (ImplicitOp.getReg() != 0)
MIB.addOperand(ImplicitOp);
if (ImplicitOp2.getReg() != 0)
MIB.addOperand(ImplicitOp2);
NewMI = addRegReg(MIB, SrcReg, isKill, SrcReg2, isKill2);
// Preserve undefness of the operands.
NewMI->getOperand(1).setIsUndef(isUndef);
NewMI->getOperand(3).setIsUndef(isUndef2);
if (LV && Src2.isKill())
LV->replaceKillInstruction(SrcReg2, MI, *NewMI);
break;
}
case X86::ADD16rr:
case X86::ADD16rr_DB: {
if (DisableLEA16)
return is64Bit ? convertToThreeAddressWithLEA(MIOpc, MFI, MI, LV)
: nullptr;
assert(MI.getNumOperands() >= 3 && "Unknown add instruction!");
unsigned Src2 = MI.getOperand(2).getReg();
bool isKill2 = MI.getOperand(2).isKill();
NewMI = addRegReg(
BuildMI(MF, MI.getDebugLoc(), get(X86::LEA16r)).addOperand(Dest),
Src.getReg(), Src.isKill(), Src2, isKill2);
// Preserve undefness of the operands.
bool isUndef = MI.getOperand(1).isUndef();
bool isUndef2 = MI.getOperand(2).isUndef();
NewMI->getOperand(1).setIsUndef(isUndef);
NewMI->getOperand(3).setIsUndef(isUndef2);
if (LV && isKill2)
LV->replaceKillInstruction(Src2, MI, *NewMI);
break;
}
case X86::ADD64ri32:
case X86::ADD64ri8:
case X86::ADD64ri32_DB:
case X86::ADD64ri8_DB:
assert(MI.getNumOperands() >= 3 && "Unknown add instruction!");
NewMI = addOffset(BuildMI(MF, MI.getDebugLoc(), get(X86::LEA64r))
.addOperand(Dest)
.addOperand(Src),
MI.getOperand(2));
break;
case X86::ADD32ri:
case X86::ADD32ri8:
case X86::ADD32ri_DB:
case X86::ADD32ri8_DB: {
assert(MI.getNumOperands() >= 3 && "Unknown add instruction!");
unsigned Opc = is64Bit ? X86::LEA64_32r : X86::LEA32r;
bool isKill, isUndef;
unsigned SrcReg;
MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ true,
SrcReg, isKill, isUndef, ImplicitOp, LV))
return nullptr;
MachineInstrBuilder MIB = BuildMI(MF, MI.getDebugLoc(), get(Opc))
.addOperand(Dest)
.addReg(SrcReg, getUndefRegState(isUndef) |
getKillRegState(isKill));
if (ImplicitOp.getReg() != 0)
MIB.addOperand(ImplicitOp);
NewMI = addOffset(MIB, MI.getOperand(2));
break;
}
case X86::ADD16ri:
case X86::ADD16ri8:
case X86::ADD16ri_DB:
case X86::ADD16ri8_DB:
if (DisableLEA16)
return is64Bit ? convertToThreeAddressWithLEA(MIOpc, MFI, MI, LV)
: nullptr;
assert(MI.getNumOperands() >= 3 && "Unknown add instruction!");
NewMI = addOffset(BuildMI(MF, MI.getDebugLoc(), get(X86::LEA16r))
.addOperand(Dest)
.addOperand(Src),
MI.getOperand(2));
break;
}
if (!NewMI) return nullptr;
if (LV) { // Update live variables
if (Src.isKill())
LV->replaceKillInstruction(Src.getReg(), MI, *NewMI);
if (Dest.isDead())
LV->replaceKillInstruction(Dest.getReg(), MI, *NewMI);
}
MFI->insert(MI.getIterator(), NewMI); // Insert the new inst
return NewMI;
}
/// This determines which of three possible cases of a three source commute
/// the source indexes correspond to taking into account any mask operands.
/// All prevents commuting a passthru operand. Returns -1 if the commute isn't
/// possible.
/// Case 0 - Possible to commute the first and second operands.
/// Case 1 - Possible to commute the first and third operands.
/// Case 2 - Possible to commute the second and third operands.
static int getThreeSrcCommuteCase(uint64_t TSFlags, unsigned SrcOpIdx1,
unsigned SrcOpIdx2) {
// Put the lowest index to SrcOpIdx1 to simplify the checks below.
if (SrcOpIdx1 > SrcOpIdx2)
std::swap(SrcOpIdx1, SrcOpIdx2);
unsigned Op1 = 1, Op2 = 2, Op3 = 3;
if (X86II::isKMasked(TSFlags)) {
// The k-mask operand cannot be commuted.
if (SrcOpIdx1 == 2)
return -1;
// For k-zero-masked operations it is Ok to commute the first vector
// operand.
// For regular k-masked operations a conservative choice is done as the
// elements of the first vector operand, for which the corresponding bit
// in the k-mask operand is set to 0, are copied to the result of the
// instruction.
// TODO/FIXME: The commute still may be legal if it is known that the
// k-mask operand is set to either all ones or all zeroes.
// It is also Ok to commute the 1st operand if all users of MI use only
// the elements enabled by the k-mask operand. For example,
// v4 = VFMADD213PSZrk v1, k, v2, v3; // v1[i] = k[i] ? v2[i]*v1[i]+v3[i]
// : v1[i];
// VMOVAPSZmrk <mem_addr>, k, v4; // this is the ONLY user of v4 ->
// // Ok, to commute v1 in FMADD213PSZrk.
if (X86II::isKMergeMasked(TSFlags) && SrcOpIdx1 == Op1)
return -1;
Op2++;
Op3++;
}
if (SrcOpIdx1 == Op1 && SrcOpIdx2 == Op2)
return 0;
if (SrcOpIdx1 == Op1 && SrcOpIdx2 == Op3)
return 1;
if (SrcOpIdx1 == Op2 && SrcOpIdx2 == Op3)
return 2;
return -1;
}
unsigned X86InstrInfo::getFMA3OpcodeToCommuteOperands(
const MachineInstr &MI, unsigned SrcOpIdx1, unsigned SrcOpIdx2,
const X86InstrFMA3Group &FMA3Group) const {
unsigned Opc = MI.getOpcode();
// Put the lowest index to SrcOpIdx1 to simplify the checks below.
if (SrcOpIdx1 > SrcOpIdx2)
std::swap(SrcOpIdx1, SrcOpIdx2);
// TODO: Commuting the 1st operand of FMA*_Int requires some additional
// analysis. The commute optimization is legal only if all users of FMA*_Int
// use only the lowest element of the FMA*_Int instruction. Such analysis are
// not implemented yet. So, just return 0 in that case.
// When such analysis are available this place will be the right place for
// calling it.
if (FMA3Group.isIntrinsic() && SrcOpIdx1 == 1)
return 0;
// Determine which case this commute is or if it can't be done.
int Case = getThreeSrcCommuteCase(MI.getDesc().TSFlags, SrcOpIdx1, SrcOpIdx2);
if (Case < 0)
return 0;
// Define the FMA forms mapping array that helps to map input FMA form
// to output FMA form to preserve the operation semantics after
// commuting the operands.
const unsigned Form132Index = 0;
const unsigned Form213Index = 1;
const unsigned Form231Index = 2;
static const unsigned FormMapping[][3] = {
// 0: SrcOpIdx1 == 1 && SrcOpIdx2 == 2;
// FMA132 A, C, b; ==> FMA231 C, A, b;
// FMA213 B, A, c; ==> FMA213 A, B, c;
// FMA231 C, A, b; ==> FMA132 A, C, b;
{ Form231Index, Form213Index, Form132Index },
// 1: SrcOpIdx1 == 1 && SrcOpIdx2 == 3;
// FMA132 A, c, B; ==> FMA132 B, c, A;
// FMA213 B, a, C; ==> FMA231 C, a, B;
// FMA231 C, a, B; ==> FMA213 B, a, C;
{ Form132Index, Form231Index, Form213Index },
// 2: SrcOpIdx1 == 2 && SrcOpIdx2 == 3;
// FMA132 a, C, B; ==> FMA213 a, B, C;
// FMA213 b, A, C; ==> FMA132 b, C, A;
// FMA231 c, A, B; ==> FMA231 c, B, A;
{ Form213Index, Form132Index, Form231Index }
};
unsigned FMAForms[3];
if (FMA3Group.isRegOpcodeFromGroup(Opc)) {
FMAForms[0] = FMA3Group.getReg132Opcode();
FMAForms[1] = FMA3Group.getReg213Opcode();
FMAForms[2] = FMA3Group.getReg231Opcode();
} else {
FMAForms[0] = FMA3Group.getMem132Opcode();
FMAForms[1] = FMA3Group.getMem213Opcode();
FMAForms[2] = FMA3Group.getMem231Opcode();
}
unsigned FormIndex;
for (FormIndex = 0; FormIndex < 3; FormIndex++)
if (Opc == FMAForms[FormIndex])
break;
// Everything is ready, just adjust the FMA opcode and return it.
FormIndex = FormMapping[Case][FormIndex];
return FMAForms[FormIndex];
}
static bool commuteVPTERNLOG(MachineInstr &MI, unsigned SrcOpIdx1,
unsigned SrcOpIdx2) {
uint64_t TSFlags = MI.getDesc().TSFlags;
// Determine which case this commute is or if it can't be done.
int Case = getThreeSrcCommuteCase(TSFlags, SrcOpIdx1, SrcOpIdx2);
if (Case < 0)
return false;
// For each case we need to swap two pairs of bits in the final immediate.
static const uint8_t SwapMasks[3][4] = {
{ 0x04, 0x10, 0x08, 0x20 }, // Swap bits 2/4 and 3/5.
{ 0x02, 0x10, 0x08, 0x40 }, // Swap bits 1/4 and 3/6.
{ 0x02, 0x04, 0x20, 0x40 }, // Swap bits 1/2 and 5/6.
};
uint8_t Imm = MI.getOperand(MI.getNumOperands()-1).getImm();
// Clear out the bits we are swapping.
uint8_t NewImm = Imm & ~(SwapMasks[Case][0] | SwapMasks[Case][1] |
SwapMasks[Case][2] | SwapMasks[Case][3]);
// If the immediate had a bit of the pair set, then set the opposite bit.
if (Imm & SwapMasks[Case][0]) NewImm |= SwapMasks[Case][1];
if (Imm & SwapMasks[Case][1]) NewImm |= SwapMasks[Case][0];
if (Imm & SwapMasks[Case][2]) NewImm |= SwapMasks[Case][3];
if (Imm & SwapMasks[Case][3]) NewImm |= SwapMasks[Case][2];
MI.getOperand(MI.getNumOperands()-1).setImm(NewImm);
return true;
}
// Returns true if this is a VPERMI2 or VPERMT2 instrution that can be
// commuted.
static bool isCommutableVPERMV3Instruction(unsigned Opcode) {
#define VPERM_CASES(Suffix) \
case X86::VPERMI2##Suffix##128rr: case X86::VPERMT2##Suffix##128rr: \
case X86::VPERMI2##Suffix##256rr: case X86::VPERMT2##Suffix##256rr: \
case X86::VPERMI2##Suffix##rr: case X86::VPERMT2##Suffix##rr: \
case X86::VPERMI2##Suffix##128rm: case X86::VPERMT2##Suffix##128rm: \
case X86::VPERMI2##Suffix##256rm: case X86::VPERMT2##Suffix##256rm: \
case X86::VPERMI2##Suffix##rm: case X86::VPERMT2##Suffix##rm: \
case X86::VPERMI2##Suffix##128rrkz: case X86::VPERMT2##Suffix##128rrkz: \
case X86::VPERMI2##Suffix##256rrkz: case X86::VPERMT2##Suffix##256rrkz: \
case X86::VPERMI2##Suffix##rrkz: case X86::VPERMT2##Suffix##rrkz: \
case X86::VPERMI2##Suffix##128rmkz: case X86::VPERMT2##Suffix##128rmkz: \
case X86::VPERMI2##Suffix##256rmkz: case X86::VPERMT2##Suffix##256rmkz: \
case X86::VPERMI2##Suffix##rmkz: case X86::VPERMT2##Suffix##rmkz:
#define VPERM_CASES_BROADCAST(Suffix) \
VPERM_CASES(Suffix) \
case X86::VPERMI2##Suffix##128rmb: case X86::VPERMT2##Suffix##128rmb: \
case X86::VPERMI2##Suffix##256rmb: case X86::VPERMT2##Suffix##256rmb: \
case X86::VPERMI2##Suffix##rmb: case X86::VPERMT2##Suffix##rmb: \
case X86::VPERMI2##Suffix##128rmbkz: case X86::VPERMT2##Suffix##128rmbkz: \
case X86::VPERMI2##Suffix##256rmbkz: case X86::VPERMT2##Suffix##256rmbkz: \
case X86::VPERMI2##Suffix##rmbkz: case X86::VPERMT2##Suffix##rmbkz:
switch (Opcode) {
default: return false;
VPERM_CASES(B)
VPERM_CASES_BROADCAST(D)
VPERM_CASES_BROADCAST(PD)
VPERM_CASES_BROADCAST(PS)
VPERM_CASES_BROADCAST(Q)
VPERM_CASES(W)
return true;
}
#undef VPERM_CASES_BROADCAST
#undef VPERM_CASES
}
// Returns commuted opcode for VPERMI2 and VPERMT2 instructions by switching
// from the I opcod to the T opcode and vice versa.
static unsigned getCommutedVPERMV3Opcode(unsigned Opcode) {
#define VPERM_CASES(Orig, New) \
case X86::Orig##128rr: return X86::New##128rr; \
case X86::Orig##128rrkz: return X86::New##128rrkz; \
case X86::Orig##128rm: return X86::New##128rm; \
case X86::Orig##128rmkz: return X86::New##128rmkz; \
case X86::Orig##256rr: return X86::New##256rr; \
case X86::Orig##256rrkz: return X86::New##256rrkz; \
case X86::Orig##256rm: return X86::New##256rm; \
case X86::Orig##256rmkz: return X86::New##256rmkz; \
case X86::Orig##rr: return X86::New##rr; \
case X86::Orig##rrkz: return X86::New##rrkz; \
case X86::Orig##rm: return X86::New##rm; \
case X86::Orig##rmkz: return X86::New##rmkz;
#define VPERM_CASES_BROADCAST(Orig, New) \
VPERM_CASES(Orig, New) \
case X86::Orig##128rmb: return X86::New##128rmb; \
case X86::Orig##128rmbkz: return X86::New##128rmbkz; \
case X86::Orig##256rmb: return X86::New##256rmb; \
case X86::Orig##256rmbkz: return X86::New##256rmbkz; \
case X86::Orig##rmb: return X86::New##rmb; \
case X86::Orig##rmbkz: return X86::New##rmbkz;
switch (Opcode) {
VPERM_CASES(VPERMI2B, VPERMT2B)
VPERM_CASES_BROADCAST(VPERMI2D, VPERMT2D)
VPERM_CASES_BROADCAST(VPERMI2PD, VPERMT2PD)
VPERM_CASES_BROADCAST(VPERMI2PS, VPERMT2PS)
VPERM_CASES_BROADCAST(VPERMI2Q, VPERMT2Q)
VPERM_CASES(VPERMI2W, VPERMT2W)
VPERM_CASES(VPERMT2B, VPERMI2B)
VPERM_CASES_BROADCAST(VPERMT2D, VPERMI2D)
VPERM_CASES_BROADCAST(VPERMT2PD, VPERMI2PD)
VPERM_CASES_BROADCAST(VPERMT2PS, VPERMI2PS)
VPERM_CASES_BROADCAST(VPERMT2Q, VPERMI2Q)
VPERM_CASES(VPERMT2W, VPERMI2W)
}
llvm_unreachable("Unreachable!");
#undef VPERM_CASES_BROADCAST
#undef VPERM_CASES
}
MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
unsigned OpIdx1,
unsigned OpIdx2) const {
auto cloneIfNew = [NewMI](MachineInstr &MI) -> MachineInstr & {
if (NewMI)
return *MI.getParent()->getParent()->CloneMachineInstr(&MI);
return MI;
};
switch (MI.getOpcode()) {
case X86::SHRD16rri8: // A = SHRD16rri8 B, C, I -> A = SHLD16rri8 C, B, (16-I)
case X86::SHLD16rri8: // A = SHLD16rri8 B, C, I -> A = SHRD16rri8 C, B, (16-I)
case X86::SHRD32rri8: // A = SHRD32rri8 B, C, I -> A = SHLD32rri8 C, B, (32-I)
case X86::SHLD32rri8: // A = SHLD32rri8 B, C, I -> A = SHRD32rri8 C, B, (32-I)
case X86::SHRD64rri8: // A = SHRD64rri8 B, C, I -> A = SHLD64rri8 C, B, (64-I)
case X86::SHLD64rri8:{// A = SHLD64rri8 B, C, I -> A = SHRD64rri8 C, B, (64-I)
unsigned Opc;
unsigned Size;
switch (MI.getOpcode()) {
default: llvm_unreachable("Unreachable!");
case X86::SHRD16rri8: Size = 16; Opc = X86::SHLD16rri8; break;
case X86::SHLD16rri8: Size = 16; Opc = X86::SHRD16rri8; break;
case X86::SHRD32rri8: Size = 32; Opc = X86::SHLD32rri8; break;
case X86::SHLD32rri8: Size = 32; Opc = X86::SHRD32rri8; break;
case X86::SHRD64rri8: Size = 64; Opc = X86::SHLD64rri8; break;
case X86::SHLD64rri8: Size = 64; Opc = X86::SHRD64rri8; break;
}
unsigned Amt = MI.getOperand(3).getImm();
auto &WorkingMI = cloneIfNew(MI);
WorkingMI.setDesc(get(Opc));
WorkingMI.getOperand(3).setImm(Size - Amt);
return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
OpIdx1, OpIdx2);
}
case X86::BLENDPDrri:
case X86::BLENDPSrri:
case X86::PBLENDWrri:
case X86::VBLENDPDrri:
case X86::VBLENDPSrri:
case X86::VBLENDPDYrri:
case X86::VBLENDPSYrri:
case X86::VPBLENDDrri:
case X86::VPBLENDWrri:
case X86::VPBLENDDYrri:
case X86::VPBLENDWYrri:{
unsigned Mask;
switch (MI.getOpcode()) {
default: llvm_unreachable("Unreachable!");
case X86::BLENDPDrri: Mask = 0x03; break;
case X86::BLENDPSrri: Mask = 0x0F; break;
case X86::PBLENDWrri: Mask = 0xFF; break;
case X86::VBLENDPDrri: Mask = 0x03; break;
case X86::VBLENDPSrri: Mask = 0x0F; break;
case X86::VBLENDPDYrri: Mask = 0x0F; break;
case X86::VBLENDPSYrri: Mask = 0xFF; break;
case X86::VPBLENDDrri: Mask = 0x0F; break;
case X86::VPBLENDWrri: Mask = 0xFF; break;
case X86::VPBLENDDYrri: Mask = 0xFF; break;
case X86::VPBLENDWYrri: Mask = 0xFF; break;
}
// Only the least significant bits of Imm are used.
unsigned Imm = MI.getOperand(3).getImm() & Mask;
auto &WorkingMI = cloneIfNew(MI);
WorkingMI.getOperand(3).setImm(Mask ^ Imm);
return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
OpIdx1, OpIdx2);
}
case X86::MOVSDrr:
case X86::MOVSSrr:
case X86::VMOVSDrr:
case X86::VMOVSSrr:{
// On SSE41 or later we can commute a MOVSS/MOVSD to a BLENDPS/BLENDPD.
if (!Subtarget.hasSSE41())
return nullptr;
unsigned Mask, Opc;
switch (MI.getOpcode()) {
default: llvm_unreachable("Unreachable!");
case X86::MOVSDrr: Opc = X86::BLENDPDrri; Mask = 0x02; break;
case X86::MOVSSrr: Opc = X86::BLENDPSrri; Mask = 0x0E; break;
case X86::VMOVSDrr: Opc = X86::VBLENDPDrri; Mask = 0x02; break;
case X86::VMOVSSrr: Opc = X86::VBLENDPSrri; Mask = 0x0E; break;
}
// MOVSD/MOVSS's 2nd operand is a FR64/FR32 reg class - we need to copy
// this over to a VR128 class like the 1st operand to use a BLENDPD/BLENDPS.
auto &MRI = MI.getParent()->getParent()->getRegInfo();
auto VR128RC = MRI.getRegClass(MI.getOperand(1).getReg());
unsigned VR128 = MRI.createVirtualRegister(VR128RC);
BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), get(TargetOpcode::COPY),
VR128)
.addReg(MI.getOperand(2).getReg());
auto &WorkingMI = cloneIfNew(MI);
WorkingMI.setDesc(get(Opc));
WorkingMI.getOperand(2).setReg(VR128);
WorkingMI.addOperand(MachineOperand::CreateImm(Mask));
return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
OpIdx1, OpIdx2);
}
case X86::PCLMULQDQrr:
case X86::VPCLMULQDQrr:{
// SRC1 64bits = Imm[0] ? SRC1[127:64] : SRC1[63:0]
// SRC2 64bits = Imm[4] ? SRC2[127:64] : SRC2[63:0]
unsigned Imm = MI.getOperand(3).getImm();
unsigned Src1Hi = Imm & 0x01;
unsigned Src2Hi = Imm & 0x10;
auto &WorkingMI = cloneIfNew(MI);
WorkingMI.getOperand(3).setImm((Src1Hi << 4) | (Src2Hi >> 4));
return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
OpIdx1, OpIdx2);
}
case X86::CMPSDrr:
case X86::CMPSSrr:
case X86::CMPPDrri:
case X86::CMPPSrri:
case X86::VCMPSDrr:
case X86::VCMPSSrr:
case X86::VCMPPDrri:
case X86::VCMPPSrri:
case X86::VCMPPDYrri:
case X86::VCMPPSYrri:
case X86::VCMPSDZrr:
case X86::VCMPSSZrr:
case X86::VCMPPDZrri:
case X86::VCMPPSZrri:
case X86::VCMPPDZ128rri:
case X86::VCMPPSZ128rri:
case X86::VCMPPDZ256rri:
case X86::VCMPPSZ256rri: {
// Float comparison can be safely commuted for
// Ordered/Unordered/Equal/NotEqual tests
unsigned Imm = MI.getOperand(3).getImm() & 0x7;
switch (Imm) {
case 0x00: // EQUAL
case 0x03: // UNORDERED
case 0x04: // NOT EQUAL
case 0x07: // ORDERED
return TargetInstrInfo::commuteInstructionImpl(MI, NewMI, OpIdx1, OpIdx2);
default:
return nullptr;
}
}
case X86::VPCMPBZ128rri: case X86::VPCMPUBZ128rri:
case X86::VPCMPBZ256rri: case X86::VPCMPUBZ256rri:
case X86::VPCMPBZrri: case X86::VPCMPUBZrri:
case X86::VPCMPDZ128rri: case X86::VPCMPUDZ128rri:
case X86::VPCMPDZ256rri: case X86::VPCMPUDZ256rri:
case X86::VPCMPDZrri: case X86::VPCMPUDZrri:
case X86::VPCMPQZ128rri: case X86::VPCMPUQZ128rri:
case X86::VPCMPQZ256rri: case X86::VPCMPUQZ256rri:
case X86::VPCMPQZrri: case X86::VPCMPUQZrri:
case X86::VPCMPWZ128rri: case X86::VPCMPUWZ128rri:
case X86::VPCMPWZ256rri: case X86::VPCMPUWZ256rri:
case X86::VPCMPWZrri: case X86::VPCMPUWZrri: {
// Flip comparison mode immediate (if necessary).
unsigned Imm = MI.getOperand(3).getImm() & 0x7;
switch (Imm) {
default: llvm_unreachable("Unreachable!");
case 0x01: Imm = 0x06; break; // LT -> NLE
case 0x02: Imm = 0x05; break; // LE -> NLT
case 0x05: Imm = 0x02; break; // NLT -> LE
case 0x06: Imm = 0x01; break; // NLE -> LT
case 0x00: // EQ
case 0x03: // FALSE
case 0x04: // NE
case 0x07: // TRUE
break;
}
auto &WorkingMI = cloneIfNew(MI);
WorkingMI.getOperand(3).setImm(Imm);
return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
OpIdx1, OpIdx2);
}
case X86::VPCOMBri: case X86::VPCOMUBri:
case X86::VPCOMDri: case X86::VPCOMUDri:
case X86::VPCOMQri: case X86::VPCOMUQri:
case X86::VPCOMWri: case X86::VPCOMUWri: {
// Flip comparison mode immediate (if necessary).
unsigned Imm = MI.getOperand(3).getImm() & 0x7;
switch (Imm) {
default: llvm_unreachable("Unreachable!");
case 0x00: Imm = 0x02; break; // LT -> GT
case 0x01: Imm = 0x03; break; // LE -> GE
case 0x02: Imm = 0x00; break; // GT -> LT
case 0x03: Imm = 0x01; break; // GE -> LE
case 0x04: // EQ
case 0x05: // NE
case 0x06: // FALSE
case 0x07: // TRUE
break;
}
auto &WorkingMI = cloneIfNew(MI);
WorkingMI.getOperand(3).setImm(Imm);
return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
OpIdx1, OpIdx2);
}
case X86::VPERM2F128rr:
case X86::VPERM2I128rr: {
// Flip permute source immediate.
// Imm & 0x02: lo = if set, select Op1.lo/hi else Op0.lo/hi.
// Imm & 0x20: hi = if set, select Op1.lo/hi else Op0.lo/hi.
unsigned Imm = MI.getOperand(3).getImm() & 0xFF;
auto &WorkingMI = cloneIfNew(MI);
WorkingMI.getOperand(3).setImm(Imm ^ 0x22);
return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
OpIdx1, OpIdx2);
}
case X86::MOVHLPSrr:
case X86::UNPCKHPDrr: {
if (!Subtarget.hasSSE2())
return nullptr;
unsigned Opc = MI.getOpcode();
switch (Opc) {
default: llvm_unreachable("Unreachable!");
case X86::MOVHLPSrr: Opc = X86::UNPCKHPDrr; break;
case X86::UNPCKHPDrr: Opc = X86::MOVHLPSrr; break;
}
auto &WorkingMI = cloneIfNew(MI);
WorkingMI.setDesc(get(Opc));
return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
OpIdx1, OpIdx2);
}
case X86::CMOVB16rr: case X86::CMOVB32rr: case X86::CMOVB64rr:
case X86::CMOVAE16rr: case X86::CMOVAE32rr: case X86::CMOVAE64rr:
case X86::CMOVE16rr: case X86::CMOVE32rr: case X86::CMOVE64rr:
case X86::CMOVNE16rr: case X86::CMOVNE32rr: case X86::CMOVNE64rr:
case X86::CMOVBE16rr: case X86::CMOVBE32rr: case X86::CMOVBE64rr:
case X86::CMOVA16rr: case X86::CMOVA32rr: case X86::CMOVA64rr:
case X86::CMOVL16rr: case X86::CMOVL32rr: case X86::CMOVL64rr:
case X86::CMOVGE16rr: case X86::CMOVGE32rr: case X86::CMOVGE64rr:
case X86::CMOVLE16rr: case X86::CMOVLE32rr: case X86::CMOVLE64rr:
case X86::CMOVG16rr: case X86::CMOVG32rr: case X86::CMOVG64rr:
case X86::CMOVS16rr: case X86::CMOVS32rr: case X86::CMOVS64rr:
case X86::CMOVNS16rr: case X86::CMOVNS32rr: case X86::CMOVNS64rr:
case X86::CMOVP16rr: case X86::CMOVP32rr: case X86::CMOVP64rr:
case X86::CMOVNP16rr: case X86::CMOVNP32rr: case X86::CMOVNP64rr:
case X86::CMOVO16rr: case X86::CMOVO32rr: case X86::CMOVO64rr:
case X86::CMOVNO16rr: case X86::CMOVNO32rr: case X86::CMOVNO64rr: {
unsigned Opc;
switch (MI.getOpcode()) {
default: llvm_unreachable("Unreachable!");
case X86::CMOVB16rr: Opc = X86::CMOVAE16rr; break;
case X86::CMOVB32rr: Opc = X86::CMOVAE32rr; break;
case X86::CMOVB64rr: Opc = X86::CMOVAE64rr; break;
case X86::CMOVAE16rr: Opc = X86::CMOVB16rr; break;
case X86::CMOVAE32rr: Opc = X86::CMOVB32rr; break;
case X86::CMOVAE64rr: Opc = X86::CMOVB64rr; break;
case X86::CMOVE16rr: Opc = X86::CMOVNE16rr; break;
case X86::CMOVE32rr: Opc = X86::CMOVNE32rr; break;
case X86::CMOVE64rr: Opc = X86::CMOVNE64rr; break;
case X86::CMOVNE16rr: Opc = X86::CMOVE16rr; break;
case X86::CMOVNE32rr: Opc = X86::CMOVE32rr; break;
case X86::CMOVNE64rr: Opc = X86::CMOVE64rr; break;
case X86::CMOVBE16rr: Opc = X86::CMOVA16rr; break;
case X86::CMOVBE32rr: Opc = X86::CMOVA32rr; break;
case X86::CMOVBE64rr: Opc = X86::CMOVA64rr; break;
case X86::CMOVA16rr: Opc = X86::CMOVBE16rr; break;
case X86::CMOVA32rr: Opc = X86::CMOVBE32rr; break;
case X86::CMOVA64rr: Opc = X86::CMOVBE64rr; break;
case X86::CMOVL16rr: Opc = X86::CMOVGE16rr; break;
case X86::CMOVL32rr: Opc = X86::CMOVGE32rr; break;
case X86::CMOVL64rr: Opc = X86::CMOVGE64rr; break;
case X86::CMOVGE16rr: Opc = X86::CMOVL16rr; break;
case X86::CMOVGE32rr: Opc = X86::CMOVL32rr; break;
case X86::CMOVGE64rr: Opc = X86::CMOVL64rr; break;
case X86::CMOVLE16rr: Opc = X86::CMOVG16rr; break;
case X86::CMOVLE32rr: Opc = X86::CMOVG32rr; break;
case X86::CMOVLE64rr: Opc = X86::CMOVG64rr; break;
case X86::CMOVG16rr: Opc = X86::CMOVLE16rr; break;
case X86::CMOVG32rr: Opc = X86::CMOVLE32rr; break;
case X86::CMOVG64rr: Opc = X86::CMOVLE64rr; break;
case X86::CMOVS16rr: Opc = X86::CMOVNS16rr; break;
case X86::CMOVS32rr: Opc = X86::CMOVNS32rr; break;
case X86::CMOVS64rr: Opc = X86::CMOVNS64rr; break;
case X86::CMOVNS16rr: Opc = X86::CMOVS16rr; break;
case X86::CMOVNS32rr: Opc = X86::CMOVS32rr; break;
case X86::CMOVNS64rr: Opc = X86::CMOVS64rr; break;
case X86::CMOVP16rr: Opc = X86::CMOVNP16rr; break;
case X86::CMOVP32rr: Opc = X86::CMOVNP32rr; break;
case X86::CMOVP64rr: Opc = X86::CMOVNP64rr; break;
case X86::CMOVNP16rr: Opc = X86::CMOVP16rr; break;
case X86::CMOVNP32rr: Opc = X86::CMOVP32rr; break;
case X86::CMOVNP64rr: Opc = X86::CMOVP64rr; break;
case X86::CMOVO16rr: Opc = X86::CMOVNO16rr; break;
case X86::CMOVO32rr: Opc = X86::CMOVNO32rr; break;
case X86::CMOVO64rr: Opc = X86::CMOVNO64rr; break;
case X86::CMOVNO16rr: Opc = X86::CMOVO16rr; break;
case X86::CMOVNO32rr: Opc = X86::CMOVO32rr; break;
case X86::CMOVNO64rr: Opc = X86::CMOVO64rr; break;
}
auto &WorkingMI = cloneIfNew(MI);
WorkingMI.setDesc(get(Opc));
return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
OpIdx1, OpIdx2);
}
case X86::VPTERNLOGDZrri: case X86::VPTERNLOGDZrmi:
case X86::VPTERNLOGDZ128rri: case X86::VPTERNLOGDZ128rmi:
case X86::VPTERNLOGDZ256rri: case X86::VPTERNLOGDZ256rmi:
case X86::VPTERNLOGQZrri: case X86::VPTERNLOGQZrmi:
case X86::VPTERNLOGQZ128rri: case X86::VPTERNLOGQZ128rmi:
case X86::VPTERNLOGQZ256rri: case X86::VPTERNLOGQZ256rmi:
case X86::VPTERNLOGDZrrik: case X86::VPTERNLOGDZrmik:
case X86::VPTERNLOGDZ128rrik: case X86::VPTERNLOGDZ128rmik:
case X86::VPTERNLOGDZ256rrik: case X86::VPTERNLOGDZ256rmik:
case X86::VPTERNLOGQZrrik: case X86::VPTERNLOGQZrmik:
case X86::VPTERNLOGQZ128rrik: case X86::VPTERNLOGQZ128rmik:
case X86::VPTERNLOGQZ256rrik: case X86::VPTERNLOGQZ256rmik:
case X86::VPTERNLOGDZrrikz: case X86::VPTERNLOGDZrmikz:
case X86::VPTERNLOGDZ128rrikz: case X86::VPTERNLOGDZ128rmikz:
case X86::VPTERNLOGDZ256rrikz: case X86::VPTERNLOGDZ256rmikz:
case X86::VPTERNLOGQZrrikz: case X86::VPTERNLOGQZrmikz:
case X86::VPTERNLOGQZ128rrikz: case X86::VPTERNLOGQZ128rmikz:
case X86::VPTERNLOGQZ256rrikz: case X86::VPTERNLOGQZ256rmikz: {
auto &WorkingMI = cloneIfNew(MI);
if (!commuteVPTERNLOG(WorkingMI, OpIdx1, OpIdx2))
return nullptr;
return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
OpIdx1, OpIdx2);
}
default: {
if (isCommutableVPERMV3Instruction(MI.getOpcode())) {
unsigned Opc = getCommutedVPERMV3Opcode(MI.getOpcode());
auto &WorkingMI = cloneIfNew(MI);
WorkingMI.setDesc(get(Opc));
return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
OpIdx1, OpIdx2);
}
const X86InstrFMA3Group *FMA3Group =
X86InstrFMA3Info::getFMA3Group(MI.getOpcode());
if (FMA3Group) {
unsigned Opc =
getFMA3OpcodeToCommuteOperands(MI, OpIdx1, OpIdx2, *FMA3Group);
if (Opc == 0)
return nullptr;
auto &WorkingMI = cloneIfNew(MI);
WorkingMI.setDesc(get(Opc));
return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
OpIdx1, OpIdx2);
}
return TargetInstrInfo::commuteInstructionImpl(MI, NewMI, OpIdx1, OpIdx2);
}
}
}
bool X86InstrInfo::findFMA3CommutedOpIndices(
const MachineInstr &MI, unsigned &SrcOpIdx1, unsigned &SrcOpIdx2,
const X86InstrFMA3Group &FMA3Group) const {
if (!findThreeSrcCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2))
return false;
// Check if we can adjust the opcode to preserve the semantics when
// commute the register operands.
return getFMA3OpcodeToCommuteOperands(MI, SrcOpIdx1, SrcOpIdx2, FMA3Group) != 0;
}
bool X86InstrInfo::findThreeSrcCommutedOpIndices(const MachineInstr &MI,
unsigned &SrcOpIdx1,
unsigned &SrcOpIdx2) const {
uint64_t TSFlags = MI.getDesc().TSFlags;
unsigned FirstCommutableVecOp = 1;
unsigned LastCommutableVecOp = 3;
unsigned KMaskOp = 0;
if (X86II::isKMasked(TSFlags)) {
// The k-mask operand has index = 2 for masked and zero-masked operations.
KMaskOp = 2;
// The operand with index = 1 is used as a source for those elements for
// which the corresponding bit in the k-mask is set to 0.
if (X86II::isKMergeMasked(TSFlags))
FirstCommutableVecOp = 3;
LastCommutableVecOp++;
}
if (isMem(MI, LastCommutableVecOp))
LastCommutableVecOp--;
// Only the first RegOpsNum operands are commutable.
// Also, the value 'CommuteAnyOperandIndex' is valid here as it means
// that the operand is not specified/fixed.
if (SrcOpIdx1 != CommuteAnyOperandIndex &&
(SrcOpIdx1 < FirstCommutableVecOp || SrcOpIdx1 > LastCommutableVecOp ||
SrcOpIdx1 == KMaskOp))
return false;
if (SrcOpIdx2 != CommuteAnyOperandIndex &&
(SrcOpIdx2 < FirstCommutableVecOp || SrcOpIdx2 > LastCommutableVecOp ||
SrcOpIdx2 == KMaskOp))
return false;
// Look for two different register operands assumed to be commutable
// regardless of the FMA opcode. The FMA opcode is adjusted later.
if (SrcOpIdx1 == CommuteAnyOperandIndex ||
SrcOpIdx2 == CommuteAnyOperandIndex) {
unsigned CommutableOpIdx1 = SrcOpIdx1;
unsigned CommutableOpIdx2 = SrcOpIdx2;
// At least one of operands to be commuted is not specified and
// this method is free to choose appropriate commutable operands.
if (SrcOpIdx1 == SrcOpIdx2)
// Both of operands are not fixed. By default set one of commutable
// operands to the last register operand of the instruction.
CommutableOpIdx2 = LastCommutableVecOp;
else if (SrcOpIdx2 == CommuteAnyOperandIndex)
// Only one of operands is not fixed.
CommutableOpIdx2 = SrcOpIdx1;
// CommutableOpIdx2 is well defined now. Let's choose another commutable
// operand and assign its index to CommutableOpIdx1.
unsigned Op2Reg = MI.getOperand(CommutableOpIdx2).getReg();
for (CommutableOpIdx1 = LastCommutableVecOp;
CommutableOpIdx1 >= FirstCommutableVecOp; CommutableOpIdx1--) {
// Just ignore and skip the k-mask operand.
if (CommutableOpIdx1 == KMaskOp)
continue;
// The commuted operands must have different registers.
// Otherwise, the commute transformation does not change anything and
// is useless then.
if (Op2Reg != MI.getOperand(CommutableOpIdx1).getReg())
break;
}
// No appropriate commutable operands were found.
if (CommutableOpIdx1 < FirstCommutableVecOp)
return false;
// Assign the found pair of commutable indices to SrcOpIdx1 and SrcOpidx2
// to return those values.
if (!fixCommutedOpIndices(SrcOpIdx1, SrcOpIdx2,
CommutableOpIdx1, CommutableOpIdx2))
return false;
}
return true;
}
bool X86InstrInfo::findCommutedOpIndices(MachineInstr &MI, unsigned &SrcOpIdx1,
unsigned &SrcOpIdx2) const {
const MCInstrDesc &Desc = MI.getDesc();
if (!Desc.isCommutable())
return false;
switch (MI.getOpcode()) {
case X86::CMPSDrr:
case X86::CMPSSrr:
case X86::CMPPDrri:
case X86::CMPPSrri:
case X86::VCMPSDrr:
case X86::VCMPSSrr:
case X86::VCMPPDrri:
case X86::VCMPPSrri:
case X86::VCMPPDYrri:
case X86::VCMPPSYrri:
case X86::VCMPSDZrr:
case X86::VCMPSSZrr:
case X86::VCMPPDZrri:
case X86::VCMPPSZrri:
case X86::VCMPPDZ128rri:
case X86::VCMPPSZ128rri:
case X86::VCMPPDZ256rri:
case X86::VCMPPSZ256rri: {
// Float comparison can be safely commuted for
// Ordered/Unordered/Equal/NotEqual tests
unsigned Imm = MI.getOperand(3).getImm() & 0x7;
switch (Imm) {
case 0x00: // EQUAL
case 0x03: // UNORDERED
case 0x04: // NOT EQUAL
case 0x07: // ORDERED
// The indices of the commutable operands are 1 and 2.
// Assign them to the returned operand indices here.
return fixCommutedOpIndices(SrcOpIdx1, SrcOpIdx2, 1, 2);
}
return false;
}
case X86::MOVSDrr:
case X86::MOVSSrr:
case X86::VMOVSDrr:
case X86::VMOVSSrr: {
if (Subtarget.hasSSE41())
return TargetInstrInfo::findCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2);
return false;
}
case X86::VPTERNLOGDZrri: case X86::VPTERNLOGDZrmi:
case X86::VPTERNLOGDZ128rri: case X86::VPTERNLOGDZ128rmi:
case X86::VPTERNLOGDZ256rri: case X86::VPTERNLOGDZ256rmi:
case X86::VPTERNLOGQZrri: case X86::VPTERNLOGQZrmi:
case X86::VPTERNLOGQZ128rri: case X86::VPTERNLOGQZ128rmi:
case X86::VPTERNLOGQZ256rri: case X86::VPTERNLOGQZ256rmi:
case X86::VPTERNLOGDZrrik: case X86::VPTERNLOGDZrmik:
case X86::VPTERNLOGDZ128rrik: case X86::VPTERNLOGDZ128rmik:
case X86::VPTERNLOGDZ256rrik: case X86::VPTERNLOGDZ256rmik:
case X86::VPTERNLOGQZrrik: case X86::VPTERNLOGQZrmik:
case X86::VPTERNLOGQZ128rrik: case X86::VPTERNLOGQZ128rmik:
case X86::VPTERNLOGQZ256rrik: case X86::VPTERNLOGQZ256rmik:
case X86::VPTERNLOGDZrrikz: case X86::VPTERNLOGDZrmikz:
case X86::VPTERNLOGDZ128rrikz: case X86::VPTERNLOGDZ128rmikz:
case X86::VPTERNLOGDZ256rrikz: case X86::VPTERNLOGDZ256rmikz:
case X86::VPTERNLOGQZrrikz: case X86::VPTERNLOGQZrmikz:
case X86::VPTERNLOGQZ128rrikz: case X86::VPTERNLOGQZ128rmikz:
case X86::VPTERNLOGQZ256rrikz: case X86::VPTERNLOGQZ256rmikz:
return findThreeSrcCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2);
default:
const X86InstrFMA3Group *FMA3Group =
X86InstrFMA3Info::getFMA3Group(MI.getOpcode());
if (FMA3Group)
return findFMA3CommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2, *FMA3Group);
// Handled masked instructions since we need to skip over the mask input
// and the preserved input.
if (Desc.TSFlags & X86II::EVEX_K) {
// First assume that the first input is the mask operand and skip past it.
unsigned CommutableOpIdx1 = Desc.getNumDefs() + 1;
unsigned CommutableOpIdx2 = Desc.getNumDefs() + 2;
// Check if the first input is tied. If there isn't one then we only
// need to skip the mask operand which we did above.
if ((MI.getDesc().getOperandConstraint(Desc.getNumDefs(),
MCOI::TIED_TO) != -1)) {
// If this is zero masking instruction with a tied operand, we need to
// move the first index back to the first input since this must
// be a 3 input instruction and we want the first two non-mask inputs.
// Otherwise this is a 2 input instruction with a preserved input and
// mask, so we need to move the indices to skip one more input.
if (Desc.TSFlags & X86II::EVEX_Z)
--CommutableOpIdx1;
else {
++CommutableOpIdx1;
++CommutableOpIdx2;
}
}
if (!fixCommutedOpIndices(SrcOpIdx1, SrcOpIdx2,
CommutableOpIdx1, CommutableOpIdx2))
return false;
if (!MI.getOperand(SrcOpIdx1).isReg() ||
!MI.getOperand(SrcOpIdx2).isReg())
// No idea.
return false;
return true;
}
return TargetInstrInfo::findCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2);
}
return false;
}
static X86::CondCode getCondFromBranchOpc(unsigned BrOpc) {
switch (BrOpc) {
default: return X86::COND_INVALID;
case X86::JE_1: return X86::COND_E;
case X86::JNE_1: return X86::COND_NE;
case X86::JL_1: return X86::COND_L;
case X86::JLE_1: return X86::COND_LE;
case X86::JG_1: return X86::COND_G;
case X86::JGE_1: return X86::COND_GE;
case X86::JB_1: return X86::COND_B;
case X86::JBE_1: return X86::COND_BE;
case X86::JA_1: return X86::COND_A;
case X86::JAE_1: return X86::COND_AE;
case X86::JS_1: return X86::COND_S;
case X86::JNS_1: return X86::COND_NS;
case X86::JP_1: return X86::COND_P;
case X86::JNP_1: return X86::COND_NP;
case X86::JO_1: return X86::COND_O;
case X86::JNO_1: return X86::COND_NO;
}
}
/// Return condition code of a SET opcode.
static X86::CondCode getCondFromSETOpc(unsigned Opc) {
switch (Opc) {
default: return X86::COND_INVALID;
case X86::SETAr: case X86::SETAm: return X86::COND_A;
case X86::SETAEr: case X86::SETAEm: return X86::COND_AE;
case X86::SETBr: case X86::SETBm: return X86::COND_B;
case X86::SETBEr: case X86::SETBEm: return X86::COND_BE;
case X86::SETEr: case X86::SETEm: return X86::COND_E;
case X86::SETGr: case X86::SETGm: return X86::COND_G;
case X86::SETGEr: case X86::SETGEm: return X86::COND_GE;
case X86::SETLr: case X86::SETLm: return X86::COND_L;
case X86::SETLEr: case X86::SETLEm: return X86::COND_LE;
case X86::SETNEr: case X86::SETNEm: return X86::COND_NE;
case X86::SETNOr: case X86::SETNOm: return X86::COND_NO;
case X86::SETNPr: case X86::SETNPm: return X86::COND_NP;
case X86::SETNSr: case X86::SETNSm: return X86::COND_NS;
case X86::SETOr: case X86::SETOm: return X86::COND_O;
case X86::SETPr: case X86::SETPm: return X86::COND_P;
case X86::SETSr: case X86::SETSm: return X86::COND_S;
}
}
/// Return condition code of a CMov opcode.
X86::CondCode X86::getCondFromCMovOpc(unsigned Opc) {
switch (Opc) {
default: return X86::COND_INVALID;
case X86::CMOVA16rm: case X86::CMOVA16rr: case X86::CMOVA32rm:
case X86::CMOVA32rr: case X86::CMOVA64rm: case X86::CMOVA64rr:
return X86::COND_A;
case X86::CMOVAE16rm: case X86::CMOVAE16rr: case X86::CMOVAE32rm:
case X86::CMOVAE32rr: case X86::CMOVAE64rm: case X86::CMOVAE64rr:
return X86::COND_AE;
case X86::CMOVB16rm: case X86::CMOVB16rr: case X86::CMOVB32rm:
case X86::CMOVB32rr: case X86::CMOVB64rm: case X86::CMOVB64rr:
return X86::COND_B;
case X86::CMOVBE16rm: case X86::CMOVBE16rr: case X86::CMOVBE32rm:
case X86::CMOVBE32rr: case X86::CMOVBE64rm: case X86::CMOVBE64rr:
return X86::COND_BE;
case X86::CMOVE16rm: case X86::CMOVE16rr: case X86::CMOVE32rm:
case X86::CMOVE32rr: case X86::CMOVE64rm: case X86::CMOVE64rr:
return X86::COND_E;
case X86::CMOVG16rm: case X86::CMOVG16rr: case X86::CMOVG32rm:
case X86::CMOVG32rr: case X86::CMOVG64rm: case X86::CMOVG64rr:
return X86::COND_G;
case X86::CMOVGE16rm: case X86::CMOVGE16rr: case X86::CMOVGE32rm:
case X86::CMOVGE32rr: case X86::CMOVGE64rm: case X86::CMOVGE64rr:
return X86::COND_GE;
case X86::CMOVL16rm: case X86::CMOVL16rr: case X86::CMOVL32rm:
case X86::CMOVL32rr: case X86::CMOVL64rm: case X86::CMOVL64rr:
return X86::COND_L;
case X86::CMOVLE16rm: case X86::CMOVLE16rr: case X86::CMOVLE32rm:
case X86::CMOVLE32rr: case X86::CMOVLE64rm: case X86::CMOVLE64rr:
return X86::COND_LE;
case X86::CMOVNE16rm: case X86::CMOVNE16rr: case X86::CMOVNE32rm:
case X86::CMOVNE32rr: case X86::CMOVNE64rm: case X86::CMOVNE64rr:
return X86::COND_NE;
case X86::CMOVNO16rm: case X86::CMOVNO16rr: case X86::CMOVNO32rm:
case X86::CMOVNO32rr: case X86::CMOVNO64rm: case X86::CMOVNO64rr:
return X86::COND_NO;
case X86::CMOVNP16rm: case X86::CMOVNP16rr: case X86::CMOVNP32rm:
case X86::CMOVNP32rr: case X86::CMOVNP64rm: case X86::CMOVNP64rr:
return X86::COND_NP;
case X86::CMOVNS16rm: case X86::CMOVNS16rr: case X86::CMOVNS32rm:
case X86::CMOVNS32rr: case X86::CMOVNS64rm: case X86::CMOVNS64rr:
return X86::COND_NS;
case X86::CMOVO16rm: case X86::CMOVO16rr: case X86::CMOVO32rm:
case X86::CMOVO32rr: case X86::CMOVO64rm: case X86::CMOVO64rr:
return X86::COND_O;
case X86::CMOVP16rm: case X86::CMOVP16rr: case X86::CMOVP32rm:
case X86::CMOVP32rr: case X86::CMOVP64rm: case X86::CMOVP64rr:
return X86::COND_P;
case X86::CMOVS16rm: case X86::CMOVS16rr: case X86::CMOVS32rm:
case X86::CMOVS32rr: case X86::CMOVS64rm: case X86::CMOVS64rr:
return X86::COND_S;
}
}
unsigned X86::GetCondBranchFromCond(X86::CondCode CC) {
switch (CC) {
default: llvm_unreachable("Illegal condition code!");
case X86::COND_E: return X86::JE_1;
case X86::COND_NE: return X86::JNE_1;
case X86::COND_L: return X86::JL_1;
case X86::COND_LE: return X86::JLE_1;
case X86::COND_G: return X86::JG_1;
case X86::COND_GE: return X86::JGE_1;
case X86::COND_B: return X86::JB_1;
case X86::COND_BE: return X86::JBE_1;
case X86::COND_A: return X86::JA_1;
case X86::COND_AE: return X86::JAE_1;
case X86::COND_S: return X86::JS_1;
case X86::COND_NS: return X86::JNS_1;
case X86::COND_P: return X86::JP_1;
case X86::COND_NP: return X86::JNP_1;
case X86::COND_O: return X86::JO_1;
case X86::COND_NO: return X86::JNO_1;
}
}
/// Return the inverse of the specified condition,
/// e.g. turning COND_E to COND_NE.
X86::CondCode X86::GetOppositeBranchCondition(X86::CondCode CC) {
switch (CC) {
default: llvm_unreachable("Illegal condition code!");
case X86::COND_E: return X86::COND_NE;
case X86::COND_NE: return X86::COND_E;
case X86::COND_L: return X86::COND_GE;
case X86::COND_LE: return X86::COND_G;
case X86::COND_G: return X86::COND_LE;
case X86::COND_GE: return X86::COND_L;
case X86::COND_B: return X86::COND_AE;
case X86::COND_BE: return X86::COND_A;
case X86::COND_A: return X86::COND_BE;
case X86::COND_AE: return X86::COND_B;
case X86::COND_S: return X86::COND_NS;
case X86::COND_NS: return X86::COND_S;
case X86::COND_P: return X86::COND_NP;
case X86::COND_NP: return X86::COND_P;
case X86::COND_O: return X86::COND_NO;
case X86::COND_NO: return X86::COND_O;
case X86::COND_NE_OR_P: return X86::COND_E_AND_NP;
case X86::COND_E_AND_NP: return X86::COND_NE_OR_P;
}
}
/// Assuming the flags are set by MI(a,b), return the condition code if we
/// modify the instructions such that flags are set by MI(b,a).
static X86::CondCode getSwappedCondition(X86::CondCode CC) {
switch (CC) {
default: return X86::COND_INVALID;
case X86::COND_E: return X86::COND_E;
case X86::COND_NE: return X86::COND_NE;
case X86::COND_L: return X86::COND_G;
case X86::COND_LE: return X86::COND_GE;
case X86::COND_G: return X86::COND_L;
case X86::COND_GE: return X86::COND_LE;
case X86::COND_B: return X86::COND_A;
case X86::COND_BE: return X86::COND_AE;
case X86::COND_A: return X86::COND_B;
case X86::COND_AE: return X86::COND_BE;
}
}
/// Return a set opcode for the given condition and
/// whether it has memory operand.
unsigned X86::getSETFromCond(CondCode CC, bool HasMemoryOperand) {
static const uint16_t Opc[16][2] = {
{ X86::SETAr, X86::SETAm },
{ X86::SETAEr, X86::SETAEm },
{ X86::SETBr, X86::SETBm },
{ X86::SETBEr, X86::SETBEm },
{ X86::SETEr, X86::SETEm },
{ X86::SETGr, X86::SETGm },
{ X86::SETGEr, X86::SETGEm },
{ X86::SETLr, X86::SETLm },
{ X86::SETLEr, X86::SETLEm },
{ X86::SETNEr, X86::SETNEm },
{ X86::SETNOr, X86::SETNOm },
{ X86::SETNPr, X86::SETNPm },
{ X86::SETNSr, X86::SETNSm },
{ X86::SETOr, X86::SETOm },
{ X86::SETPr, X86::SETPm },
{ X86::SETSr, X86::SETSm }
};
assert(CC <= LAST_VALID_COND && "Can only handle standard cond codes");
return Opc[CC][HasMemoryOperand ? 1 : 0];
}
/// Return a cmov opcode for the given condition,
/// register size in bytes, and operand type.
unsigned X86::getCMovFromCond(CondCode CC, unsigned RegBytes,
bool HasMemoryOperand) {
static const uint16_t Opc[32][3] = {
{ X86::CMOVA16rr, X86::CMOVA32rr, X86::CMOVA64rr },
{ X86::CMOVAE16rr, X86::CMOVAE32rr, X86::CMOVAE64rr },
{ X86::CMOVB16rr, X86::CMOVB32rr, X86::CMOVB64rr },
{ X86::CMOVBE16rr, X86::CMOVBE32rr, X86::CMOVBE64rr },
{ X86::CMOVE16rr, X86::CMOVE32rr, X86::CMOVE64rr },
{ X86::CMOVG16rr, X86::CMOVG32rr, X86::CMOVG64rr },
{ X86::CMOVGE16rr, X86::CMOVGE32rr, X86::CMOVGE64rr },
{ X86::CMOVL16rr, X86::CMOVL32rr, X86::CMOVL64rr },
{ X86::CMOVLE16rr, X86::CMOVLE32rr, X86::CMOVLE64rr },
{ X86::CMOVNE16rr, X86::CMOVNE32rr, X86::CMOVNE64rr },
{ X86::CMOVNO16rr, X86::CMOVNO32rr, X86::CMOVNO64rr },
{ X86::CMOVNP16rr, X86::CMOVNP32rr, X86::CMOVNP64rr },
{ X86::CMOVNS16rr, X86::CMOVNS32rr, X86::CMOVNS64rr },
{ X86::CMOVO16rr, X86::CMOVO32rr, X86::CMOVO64rr },
{ X86::CMOVP16rr, X86::CMOVP32rr, X86::CMOVP64rr },
{ X86::CMOVS16rr, X86::CMOVS32rr, X86::CMOVS64rr },
{ X86::CMOVA16rm, X86::CMOVA32rm, X86::CMOVA64rm },
{ X86::CMOVAE16rm, X86::CMOVAE32rm, X86::CMOVAE64rm },
{ X86::CMOVB16rm, X86::CMOVB32rm, X86::CMOVB64rm },
{ X86::CMOVBE16rm, X86::CMOVBE32rm, X86::CMOVBE64rm },
{ X86::CMOVE16rm, X86::CMOVE32rm, X86::CMOVE64rm },
{ X86::CMOVG16rm, X86::CMOVG32rm, X86::CMOVG64rm },
{ X86::CMOVGE16rm, X86::CMOVGE32rm, X86::CMOVGE64rm },
{ X86::CMOVL16rm, X86::CMOVL32rm, X86::CMOVL64rm },
{ X86::CMOVLE16rm, X86::CMOVLE32rm, X86::CMOVLE64rm },
{ X86::CMOVNE16rm, X86::CMOVNE32rm, X86::CMOVNE64rm },
{ X86::CMOVNO16rm, X86::CMOVNO32rm, X86::CMOVNO64rm },
{ X86::CMOVNP16rm, X86::CMOVNP32rm, X86::CMOVNP64rm },
{ X86::CMOVNS16rm, X86::CMOVNS32rm, X86::CMOVNS64rm },
{ X86::CMOVO16rm, X86::CMOVO32rm, X86::CMOVO64rm },
{ X86::CMOVP16rm, X86::CMOVP32rm, X86::CMOVP64rm },
{ X86::CMOVS16rm, X86::CMOVS32rm, X86::CMOVS64rm }
};
assert(CC < 16 && "Can only handle standard cond codes");
unsigned Idx = HasMemoryOperand ? 16+CC : CC;
switch(RegBytes) {
default: llvm_unreachable("Illegal register size!");
case 2: return Opc[Idx][0];
case 4: return Opc[Idx][1];
case 8: return Opc[Idx][2];
}
}
bool X86InstrInfo::isUnpredicatedTerminator(const MachineInstr &MI) const {
if (!MI.isTerminator()) return false;
// Conditional branch is a special case.
if (MI.isBranch() && !MI.isBarrier())
return true;
if (!MI.isPredicable())
return true;
return !isPredicated(MI);
}
-bool X86InstrInfo::isUnconditionalTailCall(const MachineInstr &MI) const {
- switch (MI.getOpcode()) {
- case X86::TCRETURNdi:
- case X86::TCRETURNri:
- case X86::TCRETURNmi:
- case X86::TCRETURNdi64:
- case X86::TCRETURNri64:
- case X86::TCRETURNmi64:
- return true;
- default:
- return false;
- }
-}
-
-bool X86InstrInfo::canMakeTailCallConditional(
- SmallVectorImpl<MachineOperand> &BranchCond,
- const MachineInstr &TailCall) const {
- if (TailCall.getOpcode() != X86::TCRETURNdi &&
- TailCall.getOpcode() != X86::TCRETURNdi64) {
- // Only direct calls can be done with a conditional branch.
- return false;
- }
-
- if (Subtarget.isTargetWin64()) {
- // Conditional tail calls confuse the Win64 unwinder.
- // TODO: Allow them for "leaf" functions; PR30337.
- return false;
- }
-
- assert(BranchCond.size() == 1);
- if (BranchCond[0].getImm() > X86::LAST_VALID_COND) {
- // Can't make a conditional tail call with this condition.
- return false;
- }
-
- const X86MachineFunctionInfo *X86FI =
- TailCall.getParent()->getParent()->getInfo<X86MachineFunctionInfo>();
- if (X86FI->getTCReturnAddrDelta() != 0 ||
- TailCall.getOperand(1).getImm() != 0) {
- // A conditional tail call cannot do any stack adjustment.
- return false;
- }
-
- return true;
-}
-
-void X86InstrInfo::replaceBranchWithTailCall(
- MachineBasicBlock &MBB, SmallVectorImpl<MachineOperand> &BranchCond,
- const MachineInstr &TailCall) const {
- assert(canMakeTailCallConditional(BranchCond, TailCall));
-
- MachineBasicBlock::iterator I = MBB.end();
- while (I != MBB.begin()) {
- --I;
- if (I->isDebugValue())
- continue;
- if (!I->isBranch())
- assert(0 && "Can't find the branch to replace!");
-
- X86::CondCode CC = getCondFromBranchOpc(I->getOpcode());
- assert(BranchCond.size() == 1);
- if (CC != BranchCond[0].getImm())
- continue;
-
- break;
- }
-
- unsigned Opc = TailCall.getOpcode() == X86::TCRETURNdi ? X86::TCRETURNdicc
- : X86::TCRETURNdi64cc;
-
- auto MIB = BuildMI(MBB, I, MBB.findDebugLoc(I), get(Opc));
- MIB->addOperand(TailCall.getOperand(0)); // Destination.
- MIB.addImm(0); // Stack offset (not used).
- MIB->addOperand(BranchCond[0]); // Condition.
- MIB.copyImplicitOps(TailCall); // Regmask and (imp-used) parameters.
-
- I->eraseFromParent();
-}
-
// Given a MBB and its TBB, find the FBB which was a fallthrough MBB (it may
// not be a fallthrough MBB now due to layout changes). Return nullptr if the
// fallthrough MBB cannot be identified.
static MachineBasicBlock *getFallThroughMBB(MachineBasicBlock *MBB,
MachineBasicBlock *TBB) {
// Look for non-EHPad successors other than TBB. If we find exactly one, it
// is the fallthrough MBB. If we find zero, then TBB is both the target MBB
// and fallthrough MBB. If we find more than one, we cannot identify the
// fallthrough MBB and should return nullptr.
MachineBasicBlock *FallthroughBB = nullptr;
for (auto SI = MBB->succ_begin(), SE = MBB->succ_end(); SI != SE; ++SI) {
if ((*SI)->isEHPad() || (*SI == TBB && FallthroughBB))
continue;
// Return a nullptr if we found more than one fallthrough successor.
if (FallthroughBB && FallthroughBB != TBB)
return nullptr;
FallthroughBB = *SI;
}
return FallthroughBB;
}
bool X86InstrInfo::AnalyzeBranchImpl(
MachineBasicBlock &MBB, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB,
SmallVectorImpl<MachineOperand> &Cond,
SmallVectorImpl<MachineInstr *> &CondBranches, bool AllowModify) const {
// Start from the bottom of the block and work up, examining the
// terminator instructions.
MachineBasicBlock::iterator I = MBB.end();
MachineBasicBlock::iterator UnCondBrIter = MBB.end();
while (I != MBB.begin()) {
--I;
if (I->isDebugValue())
continue;
// Working from the bottom, when we see a non-terminator instruction, we're
// done.
if (!isUnpredicatedTerminator(*I))
break;
// A terminator that isn't a branch can't easily be handled by this
// analysis.
if (!I->isBranch())
return true;
// Handle unconditional branches.
if (I->getOpcode() == X86::JMP_1) {
UnCondBrIter = I;
if (!AllowModify) {
TBB = I->getOperand(0).getMBB();
continue;
}
// If the block has any instructions after a JMP, delete them.
while (std::next(I) != MBB.end())
std::next(I)->eraseFromParent();
Cond.clear();
FBB = nullptr;
// Delete the JMP if it's equivalent to a fall-through.
if (MBB.isLayoutSuccessor(I->getOperand(0).getMBB())) {
TBB = nullptr;
I->eraseFromParent();
I = MBB.end();
UnCondBrIter = MBB.end();
continue;
}
// TBB is used to indicate the unconditional destination.
TBB = I->getOperand(0).getMBB();
continue;
}
// Handle conditional branches.
X86::CondCode BranchCode = getCondFromBranchOpc(I->getOpcode());
if (BranchCode == X86::COND_INVALID)
return true; // Can't handle indirect branch.
// Working from the bottom, handle the first conditional branch.
if (Cond.empty()) {
MachineBasicBlock *TargetBB = I->getOperand(0).getMBB();
if (AllowModify && UnCondBrIter != MBB.end() &&
MBB.isLayoutSuccessor(TargetBB)) {
// If we can modify the code and it ends in something like:
//
// jCC L1
// jmp L2
// L1:
// ...
// L2:
//
// Then we can change this to:
//
// jnCC L2
// L1:
// ...
// L2:
//
// Which is a bit more efficient.
// We conditionally jump to the fall-through block.
BranchCode = GetOppositeBranchCondition(BranchCode);
unsigned JNCC = GetCondBranchFromCond(BranchCode);
MachineBasicBlock::iterator OldInst = I;
BuildMI(MBB, UnCondBrIter, MBB.findDebugLoc(I), get(JNCC))
.addMBB(UnCondBrIter->getOperand(0).getMBB());
BuildMI(MBB, UnCondBrIter, MBB.findDebugLoc(I), get(X86::JMP_1))
.addMBB(TargetBB);
OldInst->eraseFromParent();
UnCondBrIter->eraseFromParent();
// Restart the analysis.
UnCondBrIter = MBB.end();
I = MBB.end();
continue;
}
FBB = TBB;
TBB = I->getOperand(0).getMBB();
Cond.push_back(MachineOperand::CreateImm(BranchCode));
CondBranches.push_back(&*I);
continue;
}
// Handle subsequent conditional branches. Only handle the case where all
// conditional branches branch to the same destination and their condition
// opcodes fit one of the special multi-branch idioms.
assert(Cond.size() == 1);
assert(TBB);
// If the conditions are the same, we can leave them alone.
X86::CondCode OldBranchCode = (X86::CondCode)Cond[0].getImm();
auto NewTBB = I->getOperand(0).getMBB();
if (OldBranchCode == BranchCode && TBB == NewTBB)
continue;
// If they differ, see if they fit one of the known patterns. Theoretically,
// we could handle more patterns here, but we shouldn't expect to see them
// if instruction selection has done a reasonable job.
if (TBB == NewTBB &&
((OldBranchCode == X86::COND_P && BranchCode == X86::COND_NE) ||
(OldBranchCode == X86::COND_NE && BranchCode == X86::COND_P))) {
BranchCode = X86::COND_NE_OR_P;
} else if ((OldBranchCode == X86::COND_NP && BranchCode == X86::COND_NE) ||
(OldBranchCode == X86::COND_E && BranchCode == X86::COND_P)) {
if (NewTBB != (FBB ? FBB : getFallThroughMBB(&MBB, TBB)))
return true;
// X86::COND_E_AND_NP usually has two different branch destinations.
//
// JP B1
// JE B2
// JMP B1
// B1:
// B2:
//
// Here this condition branches to B2 only if NP && E. It has another
// equivalent form:
//
// JNE B1
// JNP B2
// JMP B1
// B1:
// B2:
//
// Similarly it branches to B2 only if E && NP. That is why this condition
// is named with COND_E_AND_NP.
BranchCode = X86::COND_E_AND_NP;
} else
return true;
// Update the MachineOperand.
Cond[0].setImm(BranchCode);
CondBranches.push_back(&*I);
}
return false;
}
bool X86InstrInfo::analyzeBranch(MachineBasicBlock &MBB,
MachineBasicBlock *&TBB,
MachineBasicBlock *&FBB,
SmallVectorImpl<MachineOperand> &Cond,
bool AllowModify) const {
SmallVector<MachineInstr *, 4> CondBranches;
return AnalyzeBranchImpl(MBB, TBB, FBB, Cond, CondBranches, AllowModify);
}
bool X86InstrInfo::analyzeBranchPredicate(MachineBasicBlock &MBB,
MachineBranchPredicate &MBP,
bool AllowModify) const {
using namespace std::placeholders;
SmallVector<MachineOperand, 4> Cond;
SmallVector<MachineInstr *, 4> CondBranches;
if (AnalyzeBranchImpl(MBB, MBP.TrueDest, MBP.FalseDest, Cond, CondBranches,
AllowModify))
return true;
if (Cond.size() != 1)
return true;
assert(MBP.TrueDest && "expected!");
if (!MBP.FalseDest)
MBP.FalseDest = MBB.getNextNode();
const TargetRegisterInfo *TRI = &getRegisterInfo();
MachineInstr *ConditionDef = nullptr;
bool SingleUseCondition = true;
for (auto I = std::next(MBB.rbegin()), E = MBB.rend(); I != E; ++I) {
if (I->modifiesRegister(X86::EFLAGS, TRI)) {
ConditionDef = &*I;
break;
}
if (I->readsRegister(X86::EFLAGS, TRI))
SingleUseCondition = false;
}
if (!ConditionDef)
return true;
if (SingleUseCondition) {
for (auto *Succ : MBB.successors())
if (Succ->isLiveIn(X86::EFLAGS))
SingleUseCondition = false;
}
MBP.ConditionDef = ConditionDef;
MBP.SingleUseCondition = SingleUseCondition;
// Currently we only recognize the simple pattern:
//
// test %reg, %reg
// je %label
//
const unsigned TestOpcode =
Subtarget.is64Bit() ? X86::TEST64rr : X86::TEST32rr;
if (ConditionDef->getOpcode() == TestOpcode &&
ConditionDef->getNumOperands() == 3 &&
ConditionDef->getOperand(0).isIdenticalTo(ConditionDef->getOperand(1)) &&
(Cond[0].getImm() == X86::COND_NE || Cond[0].getImm() == X86::COND_E)) {
MBP.LHS = ConditionDef->getOperand(0);
MBP.RHS = MachineOperand::CreateImm(0);
MBP.Predicate = Cond[0].getImm() == X86::COND_NE
? MachineBranchPredicate::PRED_NE
: MachineBranchPredicate::PRED_EQ;
return false;
}
return true;
}
unsigned X86InstrInfo::removeBranch(MachineBasicBlock &MBB,
int *BytesRemoved) const {
assert(!BytesRemoved && "code size not handled");
MachineBasicBlock::iterator I = MBB.end();
unsigned Count = 0;
while (I != MBB.begin()) {
--I;
if (I->isDebugValue())
continue;
if (I->getOpcode() != X86::JMP_1 &&
getCondFromBranchOpc(I->getOpcode()) == X86::COND_INVALID)
break;
// Remove the branch.
I->eraseFromParent();
I = MBB.end();
++Count;
}
return Count;
}
unsigned X86InstrInfo::insertBranch(MachineBasicBlock &MBB,
MachineBasicBlock *TBB,
MachineBasicBlock *FBB,
ArrayRef<MachineOperand> Cond,
const DebugLoc &DL,
int *BytesAdded) const {
// Shouldn't be a fall through.
assert(TBB && "insertBranch must not be told to insert a fallthrough");
assert((Cond.size() == 1 || Cond.size() == 0) &&
"X86 branch conditions have one component!");
assert(!BytesAdded && "code size not handled");
if (Cond.empty()) {
// Unconditional branch?
assert(!FBB && "Unconditional branch with multiple successors!");
BuildMI(&MBB, DL, get(X86::JMP_1)).addMBB(TBB);
return 1;
}
// If FBB is null, it is implied to be a fall-through block.
bool FallThru = FBB == nullptr;
// Conditional branch.
unsigned Count = 0;
X86::CondCode CC = (X86::CondCode)Cond[0].getImm();
switch (CC) {
case X86::COND_NE_OR_P:
// Synthesize NE_OR_P with two branches.
BuildMI(&MBB, DL, get(X86::JNE_1)).addMBB(TBB);
++Count;
BuildMI(&MBB, DL, get(X86::JP_1)).addMBB(TBB);
++Count;
break;
case X86::COND_E_AND_NP:
// Use the next block of MBB as FBB if it is null.
if (FBB == nullptr) {
FBB = getFallThroughMBB(&MBB, TBB);
assert(FBB && "MBB cannot be the last block in function when the false "
"body is a fall-through.");
}
// Synthesize COND_E_AND_NP with two branches.
BuildMI(&MBB, DL, get(X86::JNE_1)).addMBB(FBB);
++Count;
BuildMI(&MBB, DL, get(X86::JNP_1)).addMBB(TBB);
++Count;
break;
default: {
unsigned Opc = GetCondBranchFromCond(CC);
BuildMI(&MBB, DL, get(Opc)).addMBB(TBB);
++Count;
}
}
if (!FallThru) {
// Two-way Conditional branch. Insert the second branch.
BuildMI(&MBB, DL, get(X86::JMP_1)).addMBB(FBB);
++Count;
}
return Count;
}
bool X86InstrInfo::
canInsertSelect(const MachineBasicBlock &MBB,
ArrayRef<MachineOperand> Cond,
unsigned TrueReg, unsigned FalseReg,
int &CondCycles, int &TrueCycles, int &FalseCycles) const {
// Not all subtargets have cmov instructions.
if (!Subtarget.hasCMov())
return false;
if (Cond.size() != 1)
return false;
// We cannot do the composite conditions, at least not in SSA form.
if ((X86::CondCode)Cond[0].getImm() > X86::COND_S)
return false;
// Check register classes.
const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
const TargetRegisterClass *RC =
RI.getCommonSubClass(MRI.getRegClass(TrueReg), MRI.getRegClass(FalseReg));
if (!RC)
return false;
// We have cmov instructions for 16, 32, and 64 bit general purpose registers.
if (X86::GR16RegClass.hasSubClassEq(RC) ||
X86::GR32RegClass.hasSubClassEq(RC) ||
X86::GR64RegClass.hasSubClassEq(RC)) {
// This latency applies to Pentium M, Merom, Wolfdale, Nehalem, and Sandy
// Bridge. Probably Ivy Bridge as well.
CondCycles = 2;
TrueCycles = 2;
FalseCycles = 2;
return true;
}
// Can't do vectors.
return false;
}
void X86InstrInfo::insertSelect(MachineBasicBlock &MBB,
MachineBasicBlock::iterator I,
const DebugLoc &DL, unsigned DstReg,
ArrayRef<MachineOperand> Cond, unsigned TrueReg,
unsigned FalseReg) const {
MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
assert(Cond.size() == 1 && "Invalid Cond array");
unsigned Opc = getCMovFromCond((X86::CondCode)Cond[0].getImm(),
MRI.getRegClass(DstReg)->getSize(),
false /*HasMemoryOperand*/);
BuildMI(MBB, I, DL, get(Opc), DstReg).addReg(FalseReg).addReg(TrueReg);
}
/// Test if the given register is a physical h register.
static bool isHReg(unsigned Reg) {
return X86::GR8_ABCD_HRegClass.contains(Reg);
}
// Try and copy between VR128/VR64 and GR64 registers.
static unsigned CopyToFromAsymmetricReg(unsigned &DestReg, unsigned &SrcReg,
const X86Subtarget &Subtarget) {
bool HasAVX = Subtarget.hasAVX();
bool HasAVX512 = Subtarget.hasAVX512();
// SrcReg(MaskReg) -> DestReg(GR64)
// SrcReg(MaskReg) -> DestReg(GR32)
// SrcReg(MaskReg) -> DestReg(GR16)
// SrcReg(MaskReg) -> DestReg(GR8)
// All KMASK RegClasses hold the same k registers, can be tested against anyone.
if (X86::VK16RegClass.contains(SrcReg)) {
if (X86::GR64RegClass.contains(DestReg)) {
assert(Subtarget.hasBWI());
return X86::KMOVQrk;
}
if (X86::GR32RegClass.contains(DestReg))
return Subtarget.hasBWI() ? X86::KMOVDrk : X86::KMOVWrk;
if (X86::GR16RegClass.contains(DestReg)) {
DestReg = getX86SubSuperRegister(DestReg, 32);
return X86::KMOVWrk;
}
if (X86::GR8RegClass.contains(DestReg)) {
DestReg = getX86SubSuperRegister(DestReg, 32);
return Subtarget.hasDQI() ? X86::KMOVBrk : X86::KMOVWrk;
}
}
// SrcReg(GR64) -> DestReg(MaskReg)
// SrcReg(GR32) -> DestReg(MaskReg)
// SrcReg(GR16) -> DestReg(MaskReg)
// SrcReg(GR8) -> DestReg(MaskReg)
// All KMASK RegClasses hold the same k registers, can be tested against anyone.
if (X86::VK16RegClass.contains(DestReg)) {
if (X86::GR64RegClass.contains(SrcReg)) {
assert(Subtarget.hasBWI());
return X86::KMOVQkr;
}
if (X86::GR32RegClass.contains(SrcReg))
return Subtarget.hasBWI() ? X86::KMOVDkr : X86::KMOVWkr;
if (X86::GR16RegClass.contains(SrcReg)) {
SrcReg = getX86SubSuperRegister(SrcReg, 32);
return X86::KMOVWkr;
}
if (X86::GR8RegClass.contains(SrcReg)) {
SrcReg = getX86SubSuperRegister(SrcReg, 32);
return Subtarget.hasDQI() ? X86::KMOVBkr : X86::KMOVWkr;
}
}
// SrcReg(VR128) -> DestReg(GR64)
// SrcReg(VR64) -> DestReg(GR64)
// SrcReg(GR64) -> DestReg(VR128)
// SrcReg(GR64) -> DestReg(VR64)
if (X86::GR64RegClass.contains(DestReg)) {
if (X86::VR128XRegClass.contains(SrcReg))
// Copy from a VR128 register to a GR64 register.
return HasAVX512 ? X86::VMOVPQIto64Zrr :
HasAVX ? X86::VMOVPQIto64rr :
X86::MOVPQIto64rr;
if (X86::VR64RegClass.contains(SrcReg))
// Copy from a VR64 register to a GR64 register.
return X86::MMX_MOVD64from64rr;
} else if (X86::GR64RegClass.contains(SrcReg)) {
// Copy from a GR64 register to a VR128 register.
if (X86::VR128XRegClass.contains(DestReg))
return HasAVX512 ? X86::VMOV64toPQIZrr :
HasAVX ? X86::VMOV64toPQIrr :
X86::MOV64toPQIrr;
// Copy from a GR64 register to a VR64 register.
if (X86::VR64RegClass.contains(DestReg))
return X86::MMX_MOVD64to64rr;
}
// SrcReg(FR32) -> DestReg(GR32)
// SrcReg(GR32) -> DestReg(FR32)
if (X86::GR32RegClass.contains(DestReg) &&
X86::FR32XRegClass.contains(SrcReg))
// Copy from a FR32 register to a GR32 register.
return HasAVX512 ? X86::VMOVSS2DIZrr :
HasAVX ? X86::VMOVSS2DIrr :
X86::MOVSS2DIrr;
if (X86::FR32XRegClass.contains(DestReg) &&
X86::GR32RegClass.contains(SrcReg))
// Copy from a GR32 register to a FR32 register.
return HasAVX512 ? X86::VMOVDI2SSZrr :
HasAVX ? X86::VMOVDI2SSrr :
X86::MOVDI2SSrr;
return 0;
}
void X86InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI,
const DebugLoc &DL, unsigned DestReg,
unsigned SrcReg, bool KillSrc) const {
// First deal with the normal symmetric copies.
bool HasAVX = Subtarget.hasAVX();
bool HasVLX = Subtarget.hasVLX();
unsigned Opc = 0;
if (X86::GR64RegClass.contains(DestReg, SrcReg))
Opc = X86::MOV64rr;
else if (X86::GR32RegClass.contains(DestReg, SrcReg))
Opc = X86::MOV32rr;
else if (X86::GR16RegClass.contains(DestReg, SrcReg))
Opc = X86::MOV16rr;
else if (X86::GR8RegClass.contains(DestReg, SrcReg)) {
// Copying to or from a physical H register on x86-64 requires a NOREX
// move. Otherwise use a normal move.
if ((isHReg(DestReg) || isHReg(SrcReg)) &&
Subtarget.is64Bit()) {
Opc = X86::MOV8rr_NOREX;
// Both operands must be encodable without an REX prefix.
assert(X86::GR8_NOREXRegClass.contains(SrcReg, DestReg) &&
"8-bit H register can not be copied outside GR8_NOREX");
} else
Opc = X86::MOV8rr;
}
else if (X86::VR64RegClass.contains(DestReg, SrcReg))
Opc = X86::MMX_MOVQ64rr;
else if (X86::VR128XRegClass.contains(DestReg, SrcReg)) {
if (HasVLX)
Opc = X86::VMOVAPSZ128rr;
else if (X86::VR128RegClass.contains(DestReg, SrcReg))
Opc = HasAVX ? X86::VMOVAPSrr : X86::MOVAPSrr;
else {
// If this an extended register and we don't have VLX we need to use a
// 512-bit move.
Opc = X86::VMOVAPSZrr;
const TargetRegisterInfo *TRI = &getRegisterInfo();
DestReg = TRI->getMatchingSuperReg(DestReg, X86::sub_xmm,
&X86::VR512RegClass);
SrcReg = TRI->getMatchingSuperReg(SrcReg, X86::sub_xmm,
&X86::VR512RegClass);
}
} else if (X86::VR256XRegClass.contains(DestReg, SrcReg)) {
if (HasVLX)
Opc = X86::VMOVAPSZ256rr;
else if (X86::VR256RegClass.contains(DestReg, SrcReg))
Opc = X86::VMOVAPSYrr;
else {
// If this an extended register and we don't have VLX we need to use a
// 512-bit move.
Opc = X86::VMOVAPSZrr;
const TargetRegisterInfo *TRI = &getRegisterInfo();
DestReg = TRI->getMatchingSuperReg(DestReg, X86::sub_ymm,
&X86::VR512RegClass);
SrcReg = TRI->getMatchingSuperReg(SrcReg, X86::sub_ymm,
&X86::VR512RegClass);
}
} else if (X86::VR512RegClass.contains(DestReg, SrcReg))
Opc = X86::VMOVAPSZrr;
// All KMASK RegClasses hold the same k registers, can be tested against anyone.
else if (X86::VK16RegClass.contains(DestReg, SrcReg))
Opc = Subtarget.hasBWI() ? X86::KMOVQkk : X86::KMOVWkk;
if (!Opc)
Opc = CopyToFromAsymmetricReg(DestReg, SrcReg, Subtarget);
if (Opc) {
BuildMI(MBB, MI, DL, get(Opc), DestReg)
.addReg(SrcReg, getKillRegState(KillSrc));
return;
}
bool FromEFLAGS = SrcReg == X86::EFLAGS;
bool ToEFLAGS = DestReg == X86::EFLAGS;
int Reg = FromEFLAGS ? DestReg : SrcReg;
bool is32 = X86::GR32RegClass.contains(Reg);
bool is64 = X86::GR64RegClass.contains(Reg);
if ((FromEFLAGS || ToEFLAGS) && (is32 || is64)) {
int Mov = is64 ? X86::MOV64rr : X86::MOV32rr;
int Push = is64 ? X86::PUSH64r : X86::PUSH32r;
int PushF = is64 ? X86::PUSHF64 : X86::PUSHF32;
int Pop = is64 ? X86::POP64r : X86::POP32r;
int PopF = is64 ? X86::POPF64 : X86::POPF32;
int AX = is64 ? X86::RAX : X86::EAX;
if (!Subtarget.hasLAHFSAHF()) {
assert(Subtarget.is64Bit() &&
"Not having LAHF/SAHF only happens on 64-bit.");
// Moving EFLAGS to / from another register requires a push and a pop.
// Notice that we have to adjust the stack if we don't want to clobber the
// first frame index. See X86FrameLowering.cpp - usesTheStack.
if (FromEFLAGS) {
BuildMI(MBB, MI, DL, get(PushF));
BuildMI(MBB, MI, DL, get(Pop), DestReg);
}
if (ToEFLAGS) {
BuildMI(MBB, MI, DL, get(Push))
.addReg(SrcReg, getKillRegState(KillSrc));
BuildMI(MBB, MI, DL, get(PopF));
}
return;
}
// The flags need to be saved, but saving EFLAGS with PUSHF/POPF is
// inefficient. Instead:
// - Save the overflow flag OF into AL using SETO, and restore it using a
// signed 8-bit addition of AL and INT8_MAX.
// - Save/restore the bottom 8 EFLAGS bits (CF, PF, AF, ZF, SF) to/from AH
// using LAHF/SAHF.
// - When RAX/EAX is live and isn't the destination register, make sure it
// isn't clobbered by PUSH/POP'ing it before and after saving/restoring
// the flags.
// This approach is ~2.25x faster than using PUSHF/POPF.
//
// This is still somewhat inefficient because we don't know which flags are
// actually live inside EFLAGS. Were we able to do a single SETcc instead of
// SETO+LAHF / ADDB+SAHF the code could be 1.02x faster.
//
// PUSHF/POPF is also potentially incorrect because it affects other flags
// such as TF/IF/DF, which LLVM doesn't model.
//
// Notice that we have to adjust the stack if we don't want to clobber the
// first frame index.
// See X86ISelLowering.cpp - X86::hasCopyImplyingStackAdjustment.
const TargetRegisterInfo *TRI = &getRegisterInfo();
MachineBasicBlock::LivenessQueryResult LQR =
MBB.computeRegisterLiveness(TRI, AX, MI);
// We do not want to save and restore AX if we do not have to.
// Moreover, if we do so whereas AX is dead, we would need to set
// an undef flag on the use of AX, otherwise the verifier will
// complain that we read an undef value.
// We do not want to change the behavior of the machine verifier
// as this is usually wrong to read an undef value.
if (MachineBasicBlock::LQR_Unknown == LQR) {
LivePhysRegs LPR(TRI);
LPR.addLiveOuts(MBB);
MachineBasicBlock::iterator I = MBB.end();
while (I != MI) {
--I;
LPR.stepBackward(*I);
}
// AX contains the top most register in the aliasing hierarchy.
// It may not be live, but one of its aliases may be.
for (MCRegAliasIterator AI(AX, TRI, true);
AI.isValid() && LQR != MachineBasicBlock::LQR_Live; ++AI)
LQR = LPR.contains(*AI) ? MachineBasicBlock::LQR_Live
: MachineBasicBlock::LQR_Dead;
}
bool AXDead = (Reg == AX) || (MachineBasicBlock::LQR_Dead == LQR);
if (!AXDead)
BuildMI(MBB, MI, DL, get(Push)).addReg(AX, getKillRegState(true));
if (FromEFLAGS) {
BuildMI(MBB, MI, DL, get(X86::SETOr), X86::AL);
BuildMI(MBB, MI, DL, get(X86::LAHF));
BuildMI(MBB, MI, DL, get(Mov), Reg).addReg(AX);
}
if (ToEFLAGS) {
BuildMI(MBB, MI, DL, get(Mov), AX).addReg(Reg, getKillRegState(KillSrc));
BuildMI(MBB, MI, DL, get(X86::ADD8ri), X86::AL)
.addReg(X86::AL)
.addImm(INT8_MAX);
BuildMI(MBB, MI, DL, get(X86::SAHF));
}
if (!AXDead)
BuildMI(MBB, MI, DL, get(Pop), AX);
return;
}
DEBUG(dbgs() << "Cannot copy " << RI.getName(SrcReg)
<< " to " << RI.getName(DestReg) << '\n');
llvm_unreachable("Cannot emit physreg copy instruction");
}
static unsigned getLoadStoreRegOpcode(unsigned Reg,
const TargetRegisterClass *RC,
bool isStackAligned,
const X86Subtarget &STI,
bool load) {
bool HasAVX = STI.hasAVX();
bool HasAVX512 = STI.hasAVX512();
bool HasVLX = STI.hasVLX();
switch (RC->getSize()) {
default:
llvm_unreachable("Unknown spill size");
case 1:
assert(X86::GR8RegClass.hasSubClassEq(RC) && "Unknown 1-byte regclass");
if (STI.is64Bit())
// Copying to or from a physical H register on x86-64 requires a NOREX
// move. Otherwise use a normal move.
if (isHReg(Reg) || X86::GR8_ABCD_HRegClass.hasSubClassEq(RC))
return load ? X86::MOV8rm_NOREX : X86::MOV8mr_NOREX;
return load ? X86::MOV8rm : X86::MOV8mr;
case 2:
if (X86::VK16RegClass.hasSubClassEq(RC))
return load ? X86::KMOVWkm : X86::KMOVWmk;
assert(X86::GR16RegClass.hasSubClassEq(RC) && "Unknown 2-byte regclass");
return load ? X86::MOV16rm : X86::MOV16mr;
case 4:
if (X86::GR32RegClass.hasSubClassEq(RC))
return load ? X86::MOV32rm : X86::MOV32mr;
if (X86::FR32XRegClass.hasSubClassEq(RC))
return load ?
(HasAVX512 ? X86::VMOVSSZrm : HasAVX ? X86::VMOVSSrm : X86::MOVSSrm) :
(HasAVX512 ? X86::VMOVSSZmr : HasAVX ? X86::VMOVSSmr : X86::MOVSSmr);
if (X86::RFP32RegClass.hasSubClassEq(RC))
return load ? X86::LD_Fp32m : X86::ST_Fp32m;
if (X86::VK32RegClass.hasSubClassEq(RC))
return load ? X86::KMOVDkm : X86::KMOVDmk;
llvm_unreachable("Unknown 4-byte regclass");
case 8:
if (X86::GR64RegClass.hasSubClassEq(RC))
return load ? X86::MOV64rm : X86::MOV64mr;
if (X86::FR64XRegClass.hasSubClassEq(RC))
return load ?
(HasAVX512 ? X86::VMOVSDZrm : HasAVX ? X86::VMOVSDrm : X86::MOVSDrm) :
(HasAVX512 ? X86::VMOVSDZmr : HasAVX ? X86::VMOVSDmr : X86::MOVSDmr);
if (X86::VR64RegClass.hasSubClassEq(RC))
return load ? X86::MMX_MOVQ64rm : X86::MMX_MOVQ64mr;
if (X86::RFP64RegClass.hasSubClassEq(RC))
return load ? X86::LD_Fp64m : X86::ST_Fp64m;
if (X86::VK64RegClass.hasSubClassEq(RC))
return load ? X86::KMOVQkm : X86::KMOVQmk;
llvm_unreachable("Unknown 8-byte regclass");
case 10:
assert(X86::RFP80RegClass.hasSubClassEq(RC) && "Unknown 10-byte regclass");
return load ? X86::LD_Fp80m : X86::ST_FpP80m;
case 16: {
assert(X86::VR128XRegClass.hasSubClassEq(RC) && "Unknown 16-byte regclass");
// If stack is realigned we can use aligned stores.
if (isStackAligned)
return load ?
(HasVLX ? X86::VMOVAPSZ128rm :
HasAVX512 ? X86::VMOVAPSZ128rm_NOVLX :
HasAVX ? X86::VMOVAPSrm :
X86::MOVAPSrm):
(HasVLX ? X86::VMOVAPSZ128mr :
HasAVX512 ? X86::VMOVAPSZ128mr_NOVLX :
HasAVX ? X86::VMOVAPSmr :
X86::MOVAPSmr);
else
return load ?
(HasVLX ? X86::VMOVUPSZ128rm :
HasAVX512 ? X86::VMOVUPSZ128rm_NOVLX :
HasAVX ? X86::VMOVUPSrm :
X86::MOVUPSrm):
(HasVLX ? X86::VMOVUPSZ128mr :
HasAVX512 ? X86::VMOVUPSZ128mr_NOVLX :
HasAVX ? X86::VMOVUPSmr :
X86::MOVUPSmr);
}
case 32:
assert(X86::VR256XRegClass.hasSubClassEq(RC) && "Unknown 32-byte regclass");
// If stack is realigned we can use aligned stores.
if (isStackAligned)
return load ?
(HasVLX ? X86::VMOVAPSZ256rm :
HasAVX512 ? X86::VMOVAPSZ256rm_NOVLX :
X86::VMOVAPSYrm) :
(HasVLX ? X86::VMOVAPSZ256mr :
HasAVX512 ? X86::VMOVAPSZ256mr_NOVLX :
X86::VMOVAPSYmr);
else
return load ?
(HasVLX ? X86::VMOVUPSZ256rm :
HasAVX512 ? X86::VMOVUPSZ256rm_NOVLX :
X86::VMOVUPSYrm) :
(HasVLX ? X86::VMOVUPSZ256mr :
HasAVX512 ? X86::VMOVUPSZ256mr_NOVLX :
X86::VMOVUPSYmr);
case 64:
assert(X86::VR512RegClass.hasSubClassEq(RC) && "Unknown 64-byte regclass");
assert(STI.hasAVX512() && "Using 512-bit register requires AVX512");
if (isStackAligned)
return load ? X86::VMOVAPSZrm : X86::VMOVAPSZmr;
else
return load ? X86::VMOVUPSZrm : X86::VMOVUPSZmr;
}
}
bool X86InstrInfo::getMemOpBaseRegImmOfs(MachineInstr &MemOp, unsigned &BaseReg,
int64_t &Offset,
const TargetRegisterInfo *TRI) const {
const MCInstrDesc &Desc = MemOp.getDesc();
int MemRefBegin = X86II::getMemoryOperandNo(Desc.TSFlags);
if (MemRefBegin < 0)
return false;
MemRefBegin += X86II::getOperandBias(Desc);
MachineOperand &BaseMO = MemOp.getOperand(MemRefBegin + X86::AddrBaseReg);
if (!BaseMO.isReg()) // Can be an MO_FrameIndex
return false;
BaseReg = BaseMO.getReg();
if (MemOp.getOperand(MemRefBegin + X86::AddrScaleAmt).getImm() != 1)
return false;
if (MemOp.getOperand(MemRefBegin + X86::AddrIndexReg).getReg() !=
X86::NoRegister)
return false;
const MachineOperand &DispMO = MemOp.getOperand(MemRefBegin + X86::AddrDisp);
// Displacement can be symbolic
if (!DispMO.isImm())
return false;
Offset = DispMO.getImm();
return true;
}
static unsigned getStoreRegOpcode(unsigned SrcReg,
const TargetRegisterClass *RC,
bool isStackAligned,
const X86Subtarget &STI) {
return getLoadStoreRegOpcode(SrcReg, RC, isStackAligned, STI, false);
}
static unsigned getLoadRegOpcode(unsigned DestReg,
const TargetRegisterClass *RC,
bool isStackAligned,
const X86Subtarget &STI) {
return getLoadStoreRegOpcode(DestReg, RC, isStackAligned, STI, true);
}
void X86InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI,
unsigned SrcReg, bool isKill, int FrameIdx,
const TargetRegisterClass *RC,
const TargetRegisterInfo *TRI) const {
const MachineFunction &MF = *MBB.getParent();
assert(MF.getFrameInfo().getObjectSize(FrameIdx) >= RC->getSize() &&
"Stack slot too small for store");
unsigned Alignment = std::max<uint32_t>(RC->getSize(), 16);
bool isAligned =
(Subtarget.getFrameLowering()->getStackAlignment() >= Alignment) ||
RI.canRealignStack(MF);
unsigned Opc = getStoreRegOpcode(SrcReg, RC, isAligned, Subtarget);
DebugLoc DL = MBB.findDebugLoc(MI);
addFrameReference(BuildMI(MBB, MI, DL, get(Opc)), FrameIdx)
.addReg(SrcReg, getKillRegState(isKill));
}
void X86InstrInfo::storeRegToAddr(MachineFunction &MF, unsigned SrcReg,
bool isKill,
SmallVectorImpl<MachineOperand> &Addr,
const TargetRegisterClass *RC,
MachineInstr::mmo_iterator MMOBegin,
MachineInstr::mmo_iterator MMOEnd,
SmallVectorImpl<MachineInstr*> &NewMIs) const {
unsigned Alignment = std::max<uint32_t>(RC->getSize(), 16);
bool isAligned = MMOBegin != MMOEnd &&
(*MMOBegin)->getAlignment() >= Alignment;
unsigned Opc = getStoreRegOpcode(SrcReg, RC, isAligned, Subtarget);
DebugLoc DL;
MachineInstrBuilder MIB = BuildMI(MF, DL, get(Opc));
for (unsigned i = 0, e = Addr.size(); i != e; ++i)
MIB.addOperand(Addr[i]);
MIB.addReg(SrcReg, getKillRegState(isKill));
(*MIB).setMemRefs(MMOBegin, MMOEnd);
NewMIs.push_back(MIB);
}
void X86InstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI,
unsigned DestReg, int FrameIdx,
const TargetRegisterClass *RC,
const TargetRegisterInfo *TRI) const {
const MachineFunction &MF = *MBB.getParent();
unsigned Alignment = std::max<uint32_t>(RC->getSize(), 16);
bool isAligned =
(Subtarget.getFrameLowering()->getStackAlignment() >= Alignment) ||
RI.canRealignStack(MF);
unsigned Opc = getLoadRegOpcode(DestReg, RC, isAligned, Subtarget);
DebugLoc DL = MBB.findDebugLoc(MI);
addFrameReference(BuildMI(MBB, MI, DL, get(Opc), DestReg), FrameIdx);
}
void X86InstrInfo::loadRegFromAddr(MachineFunction &MF, unsigned DestReg,
SmallVectorImpl<MachineOperand> &Addr,
const TargetRegisterClass *RC,
MachineInstr::mmo_iterator MMOBegin,
MachineInstr::mmo_iterator MMOEnd,
SmallVectorImpl<MachineInstr*> &NewMIs) const {
unsigned Alignment = std::max<uint32_t>(RC->getSize(), 16);
bool isAligned = MMOBegin != MMOEnd &&
(*MMOBegin)->getAlignment() >= Alignment;
unsigned Opc = getLoadRegOpcode(DestReg, RC, isAligned, Subtarget);
DebugLoc DL;
MachineInstrBuilder MIB = BuildMI(MF, DL, get(Opc), DestReg);
for (unsigned i = 0, e = Addr.size(); i != e; ++i)
MIB.addOperand(Addr[i]);
(*MIB).setMemRefs(MMOBegin, MMOEnd);
NewMIs.push_back(MIB);
}
bool X86InstrInfo::analyzeCompare(const MachineInstr &MI, unsigned &SrcReg,
unsigned &SrcReg2, int &CmpMask,
int &CmpValue) const {
switch (MI.getOpcode()) {
default: break;
case X86::CMP64ri32:
case X86::CMP64ri8:
case X86::CMP32ri:
case X86::CMP32ri8:
case X86::CMP16ri:
case X86::CMP16ri8:
case X86::CMP8ri:
if (!MI.getOperand(1).isImm())
return false;
SrcReg = MI.getOperand(0).getReg();
SrcReg2 = 0;
CmpMask = ~0;
CmpValue = MI.getOperand(1).getImm();
return true;
// A SUB can be used to perform comparison.
case X86::SUB64rm:
case X86::SUB32rm:
case X86::SUB16rm:
case X86::SUB8rm:
SrcReg = MI.getOperand(1).getReg();
SrcReg2 = 0;
CmpMask = ~0;
CmpValue = 0;
return true;
case X86::SUB64rr:
case X86::SUB32rr:
case X86::SUB16rr:
case X86::SUB8rr:
SrcReg = MI.getOperand(1).getReg();
SrcReg2 = MI.getOperand(2).getReg();
CmpMask = ~0;
CmpValue = 0;
return true;
case X86::SUB64ri32:
case X86::SUB64ri8:
case X86::SUB32ri:
case X86::SUB32ri8:
case X86::SUB16ri:
case X86::SUB16ri8:
case X86::SUB8ri:
if (!MI.getOperand(2).isImm())
return false;
SrcReg = MI.getOperand(1).getReg();
SrcReg2 = 0;
CmpMask = ~0;
CmpValue = MI.getOperand(2).getImm();
return true;
case X86::CMP64rr:
case X86::CMP32rr:
case X86::CMP16rr:
case X86::CMP8rr:
SrcReg = MI.getOperand(0).getReg();
SrcReg2 = MI.getOperand(1).getReg();
CmpMask = ~0;
CmpValue = 0;
return true;
case X86::TEST8rr:
case X86::TEST16rr:
case X86::TEST32rr:
case X86::TEST64rr:
SrcReg = MI.getOperand(0).getReg();
if (MI.getOperand(1).getReg() != SrcReg)
return false;
// Compare against zero.
SrcReg2 = 0;
CmpMask = ~0;
CmpValue = 0;
return true;
}
return false;
}
/// Check whether the first instruction, whose only
/// purpose is to update flags, can be made redundant.
/// CMPrr can be made redundant by SUBrr if the operands are the same.
/// This function can be extended later on.
/// SrcReg, SrcRegs: register operands for FlagI.
/// ImmValue: immediate for FlagI if it takes an immediate.
inline static bool isRedundantFlagInstr(MachineInstr &FlagI, unsigned SrcReg,
unsigned SrcReg2, int ImmValue,
MachineInstr &OI) {
if (((FlagI.getOpcode() == X86::CMP64rr && OI.getOpcode() == X86::SUB64rr) ||
(FlagI.getOpcode() == X86::CMP32rr && OI.getOpcode() == X86::SUB32rr) ||
(FlagI.getOpcode() == X86::CMP16rr && OI.getOpcode() == X86::SUB16rr) ||
(FlagI.getOpcode() == X86::CMP8rr && OI.getOpcode() == X86::SUB8rr)) &&
((OI.getOperand(1).getReg() == SrcReg &&
OI.getOperand(2).getReg() == SrcReg2) ||
(OI.getOperand(1).getReg() == SrcReg2 &&
OI.getOperand(2).getReg() == SrcReg)))
return true;
if (((FlagI.getOpcode() == X86::CMP64ri32 &&
OI.getOpcode() == X86::SUB64ri32) ||
(FlagI.getOpcode() == X86::CMP64ri8 &&
OI.getOpcode() == X86::SUB64ri8) ||
(FlagI.getOpcode() == X86::CMP32ri && OI.getOpcode() == X86::SUB32ri) ||
(FlagI.getOpcode() == X86::CMP32ri8 &&
OI.getOpcode() == X86::SUB32ri8) ||
(FlagI.getOpcode() == X86::CMP16ri && OI.getOpcode() == X86::SUB16ri) ||
(FlagI.getOpcode() == X86::CMP16ri8 &&
OI.getOpcode() == X86::SUB16ri8) ||
(FlagI.getOpcode() == X86::CMP8ri && OI.getOpcode() == X86::SUB8ri)) &&
OI.getOperand(1).getReg() == SrcReg &&
OI.getOperand(2).getImm() == ImmValue)
return true;
return false;
}
/// Check whether the definition can be converted
/// to remove a comparison against zero.
inline static bool isDefConvertible(MachineInstr &MI) {
switch (MI.getOpcode()) {
default: return false;
// The shift instructions only modify ZF if their shift count is non-zero.
// N.B.: The processor truncates the shift count depending on the encoding.
case X86::SAR8ri: case X86::SAR16ri: case X86::SAR32ri:case X86::SAR64ri:
case X86::SHR8ri: case X86::SHR16ri: case X86::SHR32ri:case X86::SHR64ri:
return getTruncatedShiftCount(MI, 2) != 0;
// Some left shift instructions can be turned into LEA instructions but only
// if their flags aren't used. Avoid transforming such instructions.
case X86::SHL8ri: case X86::SHL16ri: case X86::SHL32ri:case X86::SHL64ri:{
unsigned ShAmt = getTruncatedShiftCount(MI, 2);
if (isTruncatedShiftCountForLEA(ShAmt)) return false;
return ShAmt != 0;
}
case X86::SHRD16rri8:case X86::SHRD32rri8:case X86::SHRD64rri8:
case X86::SHLD16rri8:case X86::SHLD32rri8:case X86::SHLD64rri8:
return getTruncatedShiftCount(MI, 3) != 0;
case X86::SUB64ri32: case X86::SUB64ri8: case X86::SUB32ri:
case X86::SUB32ri8: case X86::SUB16ri: case X86::SUB16ri8:
case X86::SUB8ri: case X86::SUB64rr: case X86::SUB32rr:
case X86::SUB16rr: case X86::SUB8rr: case X86::SUB64rm:
case X86::SUB32rm: case X86::SUB16rm: case X86::SUB8rm:
case X86::DEC64r: case X86::DEC32r: case X86::DEC16r: case X86::DEC8r:
case X86::ADD64ri32: case X86::ADD64ri8: case X86::ADD32ri:
case X86::ADD32ri8: case X86::ADD16ri: case X86::ADD16ri8:
case X86::ADD8ri: case X86::ADD64rr: case X86::ADD32rr:
case X86::ADD16rr: case X86::ADD8rr: case X86::ADD64rm:
case X86::ADD32rm: case X86::ADD16rm: case X86::ADD8rm:
case X86::INC64r: case X86::INC32r: case X86::INC16r: case X86::INC8r:
case X86::AND64ri32: case X86::AND64ri8: case X86::AND32ri:
case X86::AND32ri8: case X86::AND16ri: case X86::AND16ri8:
case X86::AND8ri: case X86::AND64rr: case X86::AND32rr:
case X86::AND16rr: case X86::AND8rr: case X86::AND64rm:
case X86::AND32rm: case X86::AND16rm: case X86::AND8rm:
case X86::XOR64ri32: case X86::XOR64ri8: case X86::XOR32ri:
case X86::XOR32ri8: case X86::XOR16ri: case X86::XOR16ri8:
case X86::XOR8ri: case X86::XOR64rr: case X86::XOR32rr:
case X86::XOR16rr: case X86::XOR8rr: case X86::XOR64rm:
case X86::XOR32rm: case X86::XOR16rm: case X86::XOR8rm:
case X86::OR64ri32: case X86::OR64ri8: case X86::OR32ri:
case X86::OR32ri8: case X86::OR16ri: case X86::OR16ri8:
case X86::OR8ri: case X86::OR64rr: case X86::OR32rr:
case X86::OR16rr: case X86::OR8rr: case X86::OR64rm:
case X86::OR32rm: case X86::OR16rm: case X86::OR8rm:
case X86::NEG8r: case X86::NEG16r: case X86::NEG32r: case X86::NEG64r:
case X86::SAR8r1: case X86::SAR16r1: case X86::SAR32r1:case X86::SAR64r1:
case X86::SHR8r1: case X86::SHR16r1: case X86::SHR32r1:case X86::SHR64r1:
case X86::SHL8r1: case X86::SHL16r1: case X86::SHL32r1:case X86::SHL64r1:
case X86::ADC32ri: case X86::ADC32ri8:
case X86::ADC32rr: case X86::ADC64ri32:
case X86::ADC64ri8: case X86::ADC64rr:
case X86::SBB32ri: case X86::SBB32ri8:
case X86::SBB32rr: case X86::SBB64ri32:
case X86::SBB64ri8: case X86::SBB64rr:
case X86::ANDN32rr: case X86::ANDN32rm:
case X86::ANDN64rr: case X86::ANDN64rm:
case X86::BEXTR32rr: case X86::BEXTR64rr:
case X86::BEXTR32rm: case X86::BEXTR64rm:
case X86::BLSI32rr: case X86::BLSI32rm:
case X86::BLSI64rr: case X86::BLSI64rm:
case X86::BLSMSK32rr:case X86::BLSMSK32rm:
case X86::BLSMSK64rr:case X86::BLSMSK64rm:
case X86::BLSR32rr: case X86::BLSR32rm:
case X86::BLSR64rr: case X86::BLSR64rm:
case X86::BZHI32rr: case X86::BZHI32rm:
case X86::BZHI64rr: case X86::BZHI64rm:
case X86::LZCNT16rr: case X86::LZCNT16rm:
case X86::LZCNT32rr: case X86::LZCNT32rm:
case X86::LZCNT64rr: case X86::LZCNT64rm:
case X86::POPCNT16rr:case X86::POPCNT16rm:
case X86::POPCNT32rr:case X86::POPCNT32rm:
case X86::POPCNT64rr:case X86::POPCNT64rm:
case X86::TZCNT16rr: case X86::TZCNT16rm:
case X86::TZCNT32rr: case X86::TZCNT32rm:
case X86::TZCNT64rr: case X86::TZCNT64rm:
return true;
}
}
/// Check whether the use can be converted to remove a comparison against zero.
static X86::CondCode isUseDefConvertible(MachineInstr &MI) {
switch (MI.getOpcode()) {
default: return X86::COND_INVALID;
case X86::LZCNT16rr: case X86::LZCNT16rm:
case X86::LZCNT32rr: case X86::LZCNT32rm:
case X86::LZCNT64rr: case X86::LZCNT64rm:
return X86::COND_B;
case X86::POPCNT16rr:case X86::POPCNT16rm:
case X86::POPCNT32rr:case X86::POPCNT32rm:
case X86::POPCNT64rr:case X86::POPCNT64rm:
return X86::COND_E;
case X86::TZCNT16rr: case X86::TZCNT16rm:
case X86::TZCNT32rr: case X86::TZCNT32rm:
case X86::TZCNT64rr: case X86::TZCNT64rm:
return X86::COND_B;
}
}
/// Check if there exists an earlier instruction that
/// operates on the same source operands and sets flags in the same way as
/// Compare; remove Compare if possible.
bool X86InstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, unsigned SrcReg,
unsigned SrcReg2, int CmpMask,
int CmpValue,
const MachineRegisterInfo *MRI) const {
// Check whether we can replace SUB with CMP.
unsigned NewOpcode = 0;
switch (CmpInstr.getOpcode()) {
default: break;
case X86::SUB64ri32:
case X86::SUB64ri8:
case X86::SUB32ri:
case X86::SUB32ri8:
case X86::SUB16ri:
case X86::SUB16ri8:
case X86::SUB8ri:
case X86::SUB64rm:
case X86::SUB32rm:
case X86::SUB16rm:
case X86::SUB8rm:
case X86::SUB64rr:
case X86::SUB32rr:
case X86::SUB16rr:
case X86::SUB8rr: {
if (!MRI->use_nodbg_empty(CmpInstr.getOperand(0).getReg()))
return false;
// There is no use of the destination register, we can replace SUB with CMP.
switch (CmpInstr.getOpcode()) {
default: llvm_unreachable("Unreachable!");
case X86::SUB64rm: NewOpcode = X86::CMP64rm; break;
case X86::SUB32rm: NewOpcode = X86::CMP32rm; break;
case X86::SUB16rm: NewOpcode = X86::CMP16rm; break;
case X86::SUB8rm: NewOpcode = X86::CMP8rm; break;
case X86::SUB64rr: NewOpcode = X86::CMP64rr; break;
case X86::SUB32rr: NewOpcode = X86::CMP32rr; break;
case X86::SUB16rr: NewOpcode = X86::CMP16rr; break;
case X86::SUB8rr: NewOpcode = X86::CMP8rr; break;
case X86::SUB64ri32: NewOpcode = X86::CMP64ri32; break;
case X86::SUB64ri8: NewOpcode = X86::CMP64ri8; break;
case X86::SUB32ri: NewOpcode = X86::CMP32ri; break;
case X86::SUB32ri8: NewOpcode = X86::CMP32ri8; break;
case X86::SUB16ri: NewOpcode = X86::CMP16ri; break;
case X86::SUB16ri8: NewOpcode = X86::CMP16ri8; break;
case X86::SUB8ri: NewOpcode = X86::CMP8ri; break;
}
CmpInstr.setDesc(get(NewOpcode));
CmpInstr.RemoveOperand(0);
// Fall through to optimize Cmp if Cmp is CMPrr or CMPri.
if (NewOpcode == X86::CMP64rm || NewOpcode == X86::CMP32rm ||
NewOpcode == X86::CMP16rm || NewOpcode == X86::CMP8rm)
return false;
}
}
// Get the unique definition of SrcReg.
MachineInstr *MI = MRI->getUniqueVRegDef(SrcReg);
if (!MI) return false;
// CmpInstr is the first instruction of the BB.
MachineBasicBlock::iterator I = CmpInstr, Def = MI;
// If we are comparing against zero, check whether we can use MI to update
// EFLAGS. If MI is not in the same BB as CmpInstr, do not optimize.
bool IsCmpZero = (SrcReg2 == 0 && CmpValue == 0);
if (IsCmpZero && MI->getParent() != CmpInstr.getParent())
return false;
// If we have a use of the source register between the def and our compare
// instruction we can eliminate the compare iff the use sets EFLAGS in the
// right way.
bool ShouldUpdateCC = false;
X86::CondCode NewCC = X86::COND_INVALID;
if (IsCmpZero && !isDefConvertible(*MI)) {
// Scan forward from the use until we hit the use we're looking for or the
// compare instruction.
for (MachineBasicBlock::iterator J = MI;; ++J) {
// Do we have a convertible instruction?
NewCC = isUseDefConvertible(*J);
if (NewCC != X86::COND_INVALID && J->getOperand(1).isReg() &&
J->getOperand(1).getReg() == SrcReg) {
assert(J->definesRegister(X86::EFLAGS) && "Must be an EFLAGS def!");
ShouldUpdateCC = true; // Update CC later on.
// This is not a def of SrcReg, but still a def of EFLAGS. Keep going
// with the new def.
Def = J;
MI = &*Def;
break;
}
if (J == I)
return false;
}
}
// We are searching for an earlier instruction that can make CmpInstr
// redundant and that instruction will be saved in Sub.
MachineInstr *Sub = nullptr;
const TargetRegisterInfo *TRI = &getRegisterInfo();
// We iterate backward, starting from the instruction before CmpInstr and
// stop when reaching the definition of a source register or done with the BB.
// RI points to the instruction before CmpInstr.
// If the definition is in this basic block, RE points to the definition;
// otherwise, RE is the rend of the basic block.
MachineBasicBlock::reverse_iterator
RI = ++I.getReverse(),
RE = CmpInstr.getParent() == MI->getParent()
? Def.getReverse() /* points to MI */
: CmpInstr.getParent()->rend();
MachineInstr *Movr0Inst = nullptr;
for (; RI != RE; ++RI) {
MachineInstr &Instr = *RI;
// Check whether CmpInstr can be made redundant by the current instruction.
if (!IsCmpZero &&
isRedundantFlagInstr(CmpInstr, SrcReg, SrcReg2, CmpValue, Instr)) {
Sub = &Instr;
break;
}
if (Instr.modifiesRegister(X86::EFLAGS, TRI) ||
Instr.readsRegister(X86::EFLAGS, TRI)) {
// This instruction modifies or uses EFLAGS.
// MOV32r0 etc. are implemented with xor which clobbers condition code.
// They are safe to move up, if the definition to EFLAGS is dead and
// earlier instructions do not read or write EFLAGS.
if (!Movr0Inst && Instr.getOpcode() == X86::MOV32r0 &&
Instr.registerDefIsDead(X86::EFLAGS, TRI)) {
Movr0Inst = &Instr;
continue;
}
// We can't remove CmpInstr.
return false;
}
}
// Return false if no candidates exist.
if (!IsCmpZero && !Sub)
return false;
bool IsSwapped = (SrcReg2 != 0 && Sub->getOperand(1).getReg() == SrcReg2 &&
Sub->getOperand(2).getReg() == SrcReg);
// Scan forward from the instruction after CmpInstr for uses of EFLAGS.
// It is safe to remove CmpInstr if EFLAGS is redefined or killed.
// If we are done with the basic block, we need to check whether EFLAGS is
// live-out.
bool IsSafe = false;
SmallVector<std::pair<MachineInstr*, unsigned /*NewOpc*/>, 4> OpsToUpdate;
MachineBasicBlock::iterator E = CmpInstr.getParent()->end();
for (++I; I != E; ++I) {
const MachineInstr &Instr = *I;
bool ModifyEFLAGS = Instr.modifiesRegister(X86::EFLAGS, TRI);
bool UseEFLAGS = Instr.readsRegister(X86::EFLAGS, TRI);
// We should check the usage if this instruction uses and updates EFLAGS.
if (!UseEFLAGS && ModifyEFLAGS) {
// It is safe to remove CmpInstr if EFLAGS is updated again.
IsSafe = true;
break;
}
if (!UseEFLAGS && !ModifyEFLAGS)
continue;
// EFLAGS is used by this instruction.
X86::CondCode OldCC = X86::COND_INVALID;
bool OpcIsSET = false;
if (IsCmpZero || IsSwapped) {
// We decode the condition code from opcode.
if (Instr.isBranch())
OldCC = getCondFromBranchOpc(Instr.getOpcode());
else {
OldCC = getCondFromSETOpc(Instr.getOpcode());
if (OldCC != X86::COND_INVALID)
OpcIsSET = true;
else
OldCC = X86::getCondFromCMovOpc(Instr.getOpcode());
}
if (OldCC == X86::COND_INVALID) return false;
}
if (IsCmpZero) {
switch (OldCC) {
default: break;
case X86::COND_A: case X86::COND_AE:
case X86::COND_B: case X86::COND_BE:
case X86::COND_G: case X86::COND_GE:
case X86::COND_L: case X86::COND_LE:
case X86::COND_O: case X86::COND_NO:
// CF and OF are used, we can't perform this optimization.
return false;
}
// If we're updating the condition code check if we have to reverse the
// condition.
if (ShouldUpdateCC)
switch (OldCC) {
default:
return false;
case X86::COND_E:
break;
case X86::COND_NE:
NewCC = GetOppositeBranchCondition(NewCC);
break;
}
} else if (IsSwapped) {
// If we have SUB(r1, r2) and CMP(r2, r1), the condition code needs
// to be changed from r2 > r1 to r1 < r2, from r2 < r1 to r1 > r2, etc.
// We swap the condition code and synthesize the new opcode.
NewCC = getSwappedCondition(OldCC);
if (NewCC == X86::COND_INVALID) return false;
}
if ((ShouldUpdateCC || IsSwapped) && NewCC != OldCC) {
// Synthesize the new opcode.
bool HasMemoryOperand = Instr.hasOneMemOperand();
unsigned NewOpc;
if (Instr.isBranch())
NewOpc = GetCondBranchFromCond(NewCC);
else if(OpcIsSET)
NewOpc = getSETFromCond(NewCC, HasMemoryOperand);
else {
unsigned DstReg = Instr.getOperand(0).getReg();
NewOpc = getCMovFromCond(NewCC, MRI->getRegClass(DstReg)->getSize(),
HasMemoryOperand);
}
// Push the MachineInstr to OpsToUpdate.
// If it is safe to remove CmpInstr, the condition code of these
// instructions will be modified.
OpsToUpdate.push_back(std::make_pair(&*I, NewOpc));
}
if (ModifyEFLAGS || Instr.killsRegister(X86::EFLAGS, TRI)) {
// It is safe to remove CmpInstr if EFLAGS is updated again or killed.
IsSafe = true;
break;
}
}
// If EFLAGS is not killed nor re-defined, we should check whether it is
// live-out. If it is live-out, do not optimize.
if ((IsCmpZero || IsSwapped) && !IsSafe) {
MachineBasicBlock *MBB = CmpInstr.getParent();
for (MachineBasicBlock *Successor : MBB->successors())
if (Successor->isLiveIn(X86::EFLAGS))
return false;
}
// The instruction to be updated is either Sub or MI.
Sub = IsCmpZero ? MI : Sub;
// Move Movr0Inst to the appropriate place before Sub.
if (Movr0Inst) {
// Look backwards until we find a def that doesn't use the current EFLAGS.
Def = Sub;
MachineBasicBlock::reverse_iterator InsertI = Def.getReverse(),
InsertE = Sub->getParent()->rend();
for (; InsertI != InsertE; ++InsertI) {
MachineInstr *Instr = &*InsertI;
if (!Instr->readsRegister(X86::EFLAGS, TRI) &&
Instr->modifiesRegister(X86::EFLAGS, TRI)) {
Sub->getParent()->remove(Movr0Inst);
Instr->getParent()->insert(MachineBasicBlock::iterator(Instr),
Movr0Inst);
break;
}
}
if (InsertI == InsertE)
return false;
}
// Make sure Sub instruction defines EFLAGS and mark the def live.
unsigned i = 0, e = Sub->getNumOperands();
for (; i != e; ++i) {
MachineOperand &MO = Sub->getOperand(i);
if (MO.isReg() && MO.isDef() && MO.getReg() == X86::EFLAGS) {
MO.setIsDead(false);
break;
}
}
assert(i != e && "Unable to locate a def EFLAGS operand");
CmpInstr.eraseFromParent();
// Modify the condition code of instructions in OpsToUpdate.
for (auto &Op : OpsToUpdate)
Op.first->setDesc(get(Op.second));
return true;
}
/// Try to remove the load by folding it to a register
/// operand at the use. We fold the load instructions if load defines a virtual
/// register, the virtual register is used once in the same BB, and the
/// instructions in-between do not load or store, and have no side effects.
MachineInstr *X86InstrInfo::optimizeLoadInstr(MachineInstr &MI,
const MachineRegisterInfo *MRI,
unsigned &FoldAsLoadDefReg,
MachineInstr *&DefMI) const {
// Check whether we can move DefMI here.
DefMI = MRI->getVRegDef(FoldAsLoadDefReg);
assert(DefMI);
bool SawStore = false;
if (!DefMI->isSafeToMove(nullptr, SawStore))
return nullptr;
// Collect information about virtual register operands of MI.
SmallVector<unsigned, 1> SrcOperandIds;
for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
MachineOperand &MO = MI.getOperand(i);
if (!MO.isReg())
continue;
unsigned Reg = MO.getReg();
if (Reg != FoldAsLoadDefReg)
continue;
// Do not fold if we have a subreg use or a def.
if (MO.getSubReg() || MO.isDef())
return nullptr;
SrcOperandIds.push_back(i);
}
if (SrcOperandIds.empty())
return nullptr;
// Check whether we can fold the def into SrcOperandId.
if (MachineInstr *FoldMI = foldMemoryOperand(MI, SrcOperandIds, *DefMI)) {
FoldAsLoadDefReg = 0;
return FoldMI;
}
return nullptr;
}
/// Expand a single-def pseudo instruction to a two-addr
/// instruction with two undef reads of the register being defined.
/// This is used for mapping:
/// %xmm4 = V_SET0
/// to:
/// %xmm4 = PXORrr %xmm4<undef>, %xmm4<undef>
///
static bool Expand2AddrUndef(MachineInstrBuilder &MIB,
const MCInstrDesc &Desc) {
assert(Desc.getNumOperands() == 3 && "Expected two-addr instruction.");
unsigned Reg = MIB->getOperand(0).getReg();
MIB->setDesc(Desc);
// MachineInstr::addOperand() will insert explicit operands before any
// implicit operands.
MIB.addReg(Reg, RegState::Undef).addReg(Reg, RegState::Undef);
// But we don't trust that.
assert(MIB->getOperand(1).getReg() == Reg &&
MIB->getOperand(2).getReg() == Reg && "Misplaced operand");
return true;
}
/// Expand a single-def pseudo instruction to a two-addr
/// instruction with two %k0 reads.
/// This is used for mapping:
/// %k4 = K_SET1
/// to:
/// %k4 = KXNORrr %k0, %k0
static bool Expand2AddrKreg(MachineInstrBuilder &MIB,
const MCInstrDesc &Desc, unsigned Reg) {
assert(Desc.getNumOperands() == 3 && "Expected two-addr instruction.");
MIB->setDesc(Desc);
MIB.addReg(Reg, RegState::Undef).addReg(Reg, RegState::Undef);
return true;
}
static bool expandMOV32r1(MachineInstrBuilder &MIB, const TargetInstrInfo &TII,
bool MinusOne) {
MachineBasicBlock &MBB = *MIB->getParent();
DebugLoc DL = MIB->getDebugLoc();
unsigned Reg = MIB->getOperand(0).getReg();
// Insert the XOR.
BuildMI(MBB, MIB.getInstr(), DL, TII.get(X86::XOR32rr), Reg)
.addReg(Reg, RegState::Undef)
.addReg(Reg, RegState::Undef);
// Turn the pseudo into an INC or DEC.
MIB->setDesc(TII.get(MinusOne ? X86::DEC32r : X86::INC32r));
MIB.addReg(Reg);
return true;
}
static bool ExpandMOVImmSExti8(MachineInstrBuilder &MIB,
const TargetInstrInfo &TII,
const X86Subtarget &Subtarget) {
MachineBasicBlock &MBB = *MIB->getParent();
DebugLoc DL = MIB->getDebugLoc();
int64_t Imm = MIB->getOperand(1).getImm();
assert(Imm != 0 && "Using push/pop for 0 is not efficient.");
MachineBasicBlock::iterator I = MIB.getInstr();
int StackAdjustment;
if (Subtarget.is64Bit()) {
assert(MIB->getOpcode() == X86::MOV64ImmSExti8 ||
MIB->getOpcode() == X86::MOV32ImmSExti8);
// Can't use push/pop lowering if the function might write to the red zone.
X86MachineFunctionInfo *X86FI =
MBB.getParent()->getInfo<X86MachineFunctionInfo>();
if (X86FI->getUsesRedZone()) {
MIB->setDesc(TII.get(MIB->getOpcode() ==
X86::MOV32ImmSExti8 ? X86::MOV32ri : X86::MOV64ri));
return true;
}
// 64-bit mode doesn't have 32-bit push/pop, so use 64-bit operations and
// widen the register if necessary.
StackAdjustment = 8;
BuildMI(MBB, I, DL, TII.get(X86::PUSH64i8)).addImm(Imm);
MIB->setDesc(TII.get(X86::POP64r));
MIB->getOperand(0)
.setReg(getX86SubSuperRegister(MIB->getOperand(0).getReg(), 64));
} else {
assert(MIB->getOpcode() == X86::MOV32ImmSExti8);
StackAdjustment = 4;
BuildMI(MBB, I, DL, TII.get(X86::PUSH32i8)).addImm(Imm);
MIB->setDesc(TII.get(X86::POP32r));
}
// Build CFI if necessary.
MachineFunction &MF = *MBB.getParent();
const X86FrameLowering *TFL = Subtarget.getFrameLowering();
bool IsWin64Prologue = MF.getTarget().getMCAsmInfo()->usesWindowsCFI();
bool NeedsDwarfCFI =
!IsWin64Prologue &&
(MF.getMMI().hasDebugInfo() || MF.getFunction()->needsUnwindTableEntry());
bool EmitCFI = !TFL->hasFP(MF) && NeedsDwarfCFI;
if (EmitCFI) {
TFL->BuildCFI(MBB, I, DL,
MCCFIInstruction::createAdjustCfaOffset(nullptr, StackAdjustment));
TFL->BuildCFI(MBB, std::next(I), DL,
MCCFIInstruction::createAdjustCfaOffset(nullptr, -StackAdjustment));
}
return true;
}
// LoadStackGuard has so far only been implemented for 64-bit MachO. Different
// code sequence is needed for other targets.
static void expandLoadStackGuard(MachineInstrBuilder &MIB,
const TargetInstrInfo &TII) {
MachineBasicBlock &MBB = *MIB->getParent();
DebugLoc DL = MIB->getDebugLoc();
unsigned Reg = MIB->getOperand(0).getReg();
const GlobalValue *GV =
cast<GlobalValue>((*MIB->memoperands_begin())->getValue());
auto Flags = MachineMemOperand::MOLoad |
MachineMemOperand::MODereferenceable |
MachineMemOperand::MOInvariant;
MachineMemOperand *MMO = MBB.getParent()->getMachineMemOperand(
MachinePointerInfo::getGOT(*MBB.getParent()), Flags, 8, 8);
MachineBasicBlock::iterator I = MIB.getInstr();
BuildMI(MBB, I, DL, TII.get(X86::MOV64rm), Reg).addReg(X86::RIP).addImm(1)
.addReg(0).addGlobalAddress(GV, 0, X86II::MO_GOTPCREL).addReg(0)
.addMemOperand(MMO);
MIB->setDebugLoc(DL);
MIB->setDesc(TII.get(X86::MOV64rm));
MIB.addReg(Reg, RegState::Kill).addImm(1).addReg(0).addImm(0).addReg(0);
}
// This is used to handle spills for 128/256-bit registers when we have AVX512,
// but not VLX. If it uses an extended register we need to use an instruction
// that loads the lower 128/256-bit, but is available with only AVX512F.
static bool expandNOVLXLoad(MachineInstrBuilder &MIB,
const TargetRegisterInfo *TRI,
const MCInstrDesc &LoadDesc,
const MCInstrDesc &BroadcastDesc,
unsigned SubIdx) {
unsigned DestReg = MIB->getOperand(0).getReg();
// Check if DestReg is XMM16-31 or YMM16-31.
if (TRI->getEncodingValue(DestReg) < 16) {
// We can use a normal VEX encoded load.
MIB->setDesc(LoadDesc);
} else {
// Use a 128/256-bit VBROADCAST instruction.
MIB->setDesc(BroadcastDesc);
// Change the destination to a 512-bit register.
DestReg = TRI->getMatchingSuperReg(DestReg, SubIdx, &X86::VR512RegClass);
MIB->getOperand(0).setReg(DestReg);
}
return true;
}
// This is used to handle spills for 128/256-bit registers when we have AVX512,
// but not VLX. If it uses an extended register we need to use an instruction
// that stores the lower 128/256-bit, but is available with only AVX512F.
static bool expandNOVLXStore(MachineInstrBuilder &MIB,
const TargetRegisterInfo *TRI,
const MCInstrDesc &StoreDesc,
const MCInstrDesc &ExtractDesc,
unsigned SubIdx) {
unsigned SrcReg = MIB->getOperand(X86::AddrNumOperands).getReg();
// Check if DestReg is XMM16-31 or YMM16-31.
if (TRI->getEncodingValue(SrcReg) < 16) {
// We can use a normal VEX encoded store.
MIB->setDesc(StoreDesc);
} else {
// Use a VEXTRACTF instruction.
MIB->setDesc(ExtractDesc);
// Change the destination to a 512-bit register.
SrcReg = TRI->getMatchingSuperReg(SrcReg, SubIdx, &X86::VR512RegClass);
MIB->getOperand(X86::AddrNumOperands).setReg(SrcReg);
MIB.addImm(0x0); // Append immediate to extract from the lower bits.
}
return true;
}
bool X86InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
bool HasAVX = Subtarget.hasAVX();
MachineInstrBuilder MIB(*MI.getParent()->getParent(), MI);
switch (MI.getOpcode()) {
case X86::MOV32r0:
return Expand2AddrUndef(MIB, get(X86::XOR32rr));
case X86::MOV32r1:
return expandMOV32r1(MIB, *this, /*MinusOne=*/ false);
case X86::MOV32r_1:
return expandMOV32r1(MIB, *this, /*MinusOne=*/ true);
case X86::MOV32ImmSExti8:
case X86::MOV64ImmSExti8:
return ExpandMOVImmSExti8(MIB, *this, Subtarget);
case X86::SETB_C8r:
return Expand2AddrUndef(MIB, get(X86::SBB8rr));
case X86::SETB_C16r:
return Expand2AddrUndef(MIB, get(X86::SBB16rr));
case X86::SETB_C32r:
return Expand2AddrUndef(MIB, get(X86::SBB32rr));
case X86::SETB_C64r:
return Expand2AddrUndef(MIB, get(X86::SBB64rr));
case X86::V_SET0:
case X86::FsFLD0SS:
case X86::FsFLD0SD:
return Expand2AddrUndef(MIB, get(HasAVX ? X86::VXORPSrr : X86::XORPSrr));
case X86::AVX_SET0:
assert(HasAVX && "AVX not supported");
return Expand2AddrUndef(MIB, get(X86::VXORPSYrr));
case X86::AVX512_128_SET0:
return Expand2AddrUndef(MIB, get(X86::VPXORDZ128rr));
case X86::AVX512_256_SET0:
return Expand2AddrUndef(MIB, get(X86::VPXORDZ256rr));
case X86::AVX512_512_SET0:
return Expand2AddrUndef(MIB, get(X86::VPXORDZrr));
case X86::AVX512_FsFLD0SS:
case X86::AVX512_FsFLD0SD:
return Expand2AddrUndef(MIB, get(X86::VXORPSZ128rr));
case X86::V_SETALLONES:
return Expand2AddrUndef(MIB, get(HasAVX ? X86::VPCMPEQDrr : X86::PCMPEQDrr));
case X86::AVX2_SETALLONES:
return Expand2AddrUndef(MIB, get(X86::VPCMPEQDYrr));
case X86::AVX512_512_SETALLONES: {
unsigned Reg = MIB->getOperand(0).getReg();
MIB->setDesc(get(X86::VPTERNLOGDZrri));
// VPTERNLOGD needs 3 register inputs and an immediate.
// 0xff will return 1s for any input.
MIB.addReg(Reg, RegState::Undef).addReg(Reg, RegState::Undef)
.addReg(Reg, RegState::Undef).addImm(0xff);
return true;
}
case X86::AVX512_512_SEXT_MASK_32:
case X86::AVX512_512_SEXT_MASK_64: {
unsigned Reg = MIB->getOperand(0).getReg();
unsigned MaskReg = MIB->getOperand(1).getReg();
unsigned MaskState = getRegState(MIB->getOperand(1));
unsigned Opc = (MI.getOpcode() == X86::AVX512_512_SEXT_MASK_64) ?
X86::VPTERNLOGQZrrikz : X86::VPTERNLOGDZrrikz;
MI.RemoveOperand(1);
MIB->setDesc(get(Opc));
// VPTERNLOG needs 3 register inputs and an immediate.
// 0xff will return 1s for any input.
MIB.addReg(Reg, RegState::Undef).addReg(MaskReg, MaskState)
.addReg(Reg, RegState::Undef).addReg(Reg, RegState::Undef).addImm(0xff);
return true;
}
case X86::VMOVAPSZ128rm_NOVLX:
return expandNOVLXLoad(MIB, &getRegisterInfo(), get(X86::VMOVAPSrm),
get(X86::VBROADCASTF32X4rm), X86::sub_xmm);
case X86::VMOVUPSZ128rm_NOVLX:
return expandNOVLXLoad(MIB, &getRegisterInfo(), get(X86::VMOVUPSrm),
get(X86::VBROADCASTF32X4rm), X86::sub_xmm);
case X86::VMOVAPSZ256rm_NOVLX:
return expandNOVLXLoad(MIB, &getRegisterInfo(), get(X86::VMOVAPSYrm),
get(X86::VBROADCASTF64X4rm), X86::sub_ymm);
case X86::VMOVUPSZ256rm_NOVLX:
return expandNOVLXLoad(MIB, &getRegisterInfo(), get(X86::VMOVUPSYrm),
get(X86::VBROADCASTF64X4rm), X86::sub_ymm);
case X86::VMOVAPSZ128mr_NOVLX:
return expandNOVLXStore(MIB, &getRegisterInfo(), get(X86::VMOVAPSmr),
get(X86::VEXTRACTF32x4Zmr), X86::sub_xmm);
case X86::VMOVUPSZ128mr_NOVLX:
return expandNOVLXStore(MIB, &getRegisterInfo(), get(X86::VMOVUPSmr),
get(X86::VEXTRACTF32x4Zmr), X86::sub_xmm);
case X86::VMOVAPSZ256mr_NOVLX:
return expandNOVLXStore(MIB, &getRegisterInfo(), get(X86::VMOVAPSYmr),
get(X86::VEXTRACTF64x4Zmr), X86::sub_ymm);
case X86::VMOVUPSZ256mr_NOVLX:
return expandNOVLXStore(MIB, &getRegisterInfo(), get(X86::VMOVUPSYmr),
get(X86::VEXTRACTF64x4Zmr), X86::sub_ymm);
case X86::TEST8ri_NOREX:
MI.setDesc(get(X86::TEST8ri));
return true;
case X86::MOV32ri64:
MI.setDesc(get(X86::MOV32ri));
return true;
// KNL does not recognize dependency-breaking idioms for mask registers,
// so kxnor %k1, %k1, %k2 has a RAW dependence on %k1.
// Using %k0 as the undef input register is a performance heuristic based
// on the assumption that %k0 is used less frequently than the other mask
// registers, since it is not usable as a write mask.
// FIXME: A more advanced approach would be to choose the best input mask
// register based on context.
case X86::KSET0B:
case X86::KSET0W: return Expand2AddrKreg(MIB, get(X86::KXORWrr), X86::K0);
case X86::KSET0D: return Expand2AddrKreg(MIB, get(X86::KXORDrr), X86::K0);
case X86::KSET0Q: return Expand2AddrKreg(MIB, get(X86::KXORQrr), X86::K0);
case X86::KSET1B:
case X86::KSET1W: return Expand2AddrKreg(MIB, get(X86::KXNORWrr), X86::K0);
case X86::KSET1D: return Expand2AddrKreg(MIB, get(X86::KXNORDrr), X86::K0);
case X86::KSET1Q: return Expand2AddrKreg(MIB, get(X86::KXNORQrr), X86::K0);
case TargetOpcode::LOAD_STACK_GUARD:
expandLoadStackGuard(MIB, *this);
return true;
}
return false;
}
static void addOperands(MachineInstrBuilder &MIB, ArrayRef<MachineOperand> MOs,
int PtrOffset = 0) {
unsigned NumAddrOps = MOs.size();
if (NumAddrOps < 4) {
// FrameIndex only - add an immediate offset (whether its zero or not).
for (unsigned i = 0; i != NumAddrOps; ++i)
MIB.addOperand(MOs[i]);
addOffset(MIB, PtrOffset);
} else {
// General Memory Addressing - we need to add any offset to an existing
// offset.
assert(MOs.size() == 5 && "Unexpected memory operand list length");
for (unsigned i = 0; i != NumAddrOps; ++i) {
const MachineOperand &MO = MOs[i];
if (i == 3 && PtrOffset != 0) {
MIB.addDisp(MO, PtrOffset);
} else {
MIB.addOperand(MO);
}
}
}
}
static MachineInstr *FuseTwoAddrInst(MachineFunction &MF, unsigned Opcode,
ArrayRef<MachineOperand> MOs,
MachineBasicBlock::iterator InsertPt,
MachineInstr &MI,
const TargetInstrInfo &TII) {
// Create the base instruction with the memory operand as the first part.
// Omit the implicit operands, something BuildMI can't do.
MachineInstr *NewMI =
MF.CreateMachineInstr(TII.get(Opcode), MI.getDebugLoc(), true);
MachineInstrBuilder MIB(MF, NewMI);
addOperands(MIB, MOs);
// Loop over the rest of the ri operands, converting them over.
unsigned NumOps = MI.getDesc().getNumOperands() - 2;
for (unsigned i = 0; i != NumOps; ++i) {
MachineOperand &MO = MI.getOperand(i + 2);
MIB.addOperand(MO);
}
for (unsigned i = NumOps + 2, e = MI.getNumOperands(); i != e; ++i) {
MachineOperand &MO = MI.getOperand(i);
MIB.addOperand(MO);
}
MachineBasicBlock *MBB = InsertPt->getParent();
MBB->insert(InsertPt, NewMI);
return MIB;
}
static MachineInstr *FuseInst(MachineFunction &MF, unsigned Opcode,
unsigned OpNo, ArrayRef<MachineOperand> MOs,
MachineBasicBlock::iterator InsertPt,
MachineInstr &MI, const TargetInstrInfo &TII,
int PtrOffset = 0) {
// Omit the implicit operands, something BuildMI can't do.
MachineInstr *NewMI =
MF.CreateMachineInstr(TII.get(Opcode), MI.getDebugLoc(), true);
MachineInstrBuilder MIB(MF, NewMI);
for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
MachineOperand &MO = MI.getOperand(i);
if (i == OpNo) {
assert(MO.isReg() && "Expected to fold into reg operand!");
addOperands(MIB, MOs, PtrOffset);
} else {
MIB.addOperand(MO);
}
}
MachineBasicBlock *MBB = InsertPt->getParent();
MBB->insert(InsertPt, NewMI);
return MIB;
}
static MachineInstr *MakeM0Inst(const TargetInstrInfo &TII, unsigned Opcode,
ArrayRef<MachineOperand> MOs,
MachineBasicBlock::iterator InsertPt,
MachineInstr &MI) {
MachineInstrBuilder MIB = BuildMI(*InsertPt->getParent(), InsertPt,
MI.getDebugLoc(), TII.get(Opcode));
addOperands(MIB, MOs);
return MIB.addImm(0);
}
MachineInstr *X86InstrInfo::foldMemoryOperandCustom(
MachineFunction &MF, MachineInstr &MI, unsigned OpNum,
ArrayRef<MachineOperand> MOs, MachineBasicBlock::iterator InsertPt,
unsigned Size, unsigned Align) const {
switch (MI.getOpcode()) {
case X86::INSERTPSrr:
case X86::VINSERTPSrr:
case X86::VINSERTPSZrr:
// Attempt to convert the load of inserted vector into a fold load
// of a single float.
if (OpNum == 2) {
unsigned Imm = MI.getOperand(MI.getNumOperands() - 1).getImm();
unsigned ZMask = Imm & 15;
unsigned DstIdx = (Imm >> 4) & 3;
unsigned SrcIdx = (Imm >> 6) & 3;
unsigned RCSize = getRegClass(MI.getDesc(), OpNum, &RI, MF)->getSize();
if (Size <= RCSize && 4 <= Align) {
int PtrOffset = SrcIdx * 4;
unsigned NewImm = (DstIdx << 4) | ZMask;
unsigned NewOpCode =
(MI.getOpcode() == X86::VINSERTPSZrr) ? X86::VINSERTPSZrm :
(MI.getOpcode() == X86::VINSERTPSrr) ? X86::VINSERTPSrm :
X86::INSERTPSrm;
MachineInstr *NewMI =
FuseInst(MF, NewOpCode, OpNum, MOs, InsertPt, MI, *this, PtrOffset);
NewMI->getOperand(NewMI->getNumOperands() - 1).setImm(NewImm);
return NewMI;
}
}
break;
case X86::MOVHLPSrr:
case X86::VMOVHLPSrr:
case X86::VMOVHLPSZrr:
// Move the upper 64-bits of the second operand to the lower 64-bits.
// To fold the load, adjust the pointer to the upper and use (V)MOVLPS.
// TODO: In most cases AVX doesn't have a 8-byte alignment requirement.
if (OpNum == 2) {
unsigned RCSize = getRegClass(MI.getDesc(), OpNum, &RI, MF)->getSize();
if (Size <= RCSize && 8 <= Align) {
unsigned NewOpCode =
(MI.getOpcode() == X86::VMOVHLPSZrr) ? X86::VMOVLPSZ128rm :
(MI.getOpcode() == X86::VMOVHLPSrr) ? X86::VMOVLPSrm :
X86::MOVLPSrm;
MachineInstr *NewMI =
FuseInst(MF, NewOpCode, OpNum, MOs, InsertPt, MI, *this, 8);
return NewMI;
}
}
break;
};
return nullptr;
}
MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
MachineFunction &MF, MachineInstr &MI, unsigned OpNum,
ArrayRef<MachineOperand> MOs, MachineBasicBlock::iterator InsertPt,
unsigned Size, unsigned Align, bool AllowCommute) const {
const DenseMap<unsigned,
std::pair<uint16_t, uint16_t> > *OpcodeTablePtr = nullptr;
bool isCallRegIndirect = Subtarget.callRegIndirect();
bool isTwoAddrFold = false;
// For CPUs that favor the register form of a call or push,
// do not fold loads into calls or pushes, unless optimizing for size
// aggressively.
if (isCallRegIndirect && !MF.getFunction()->optForMinSize() &&
(MI.getOpcode() == X86::CALL32r || MI.getOpcode() == X86::CALL64r ||
MI.getOpcode() == X86::PUSH16r || MI.getOpcode() == X86::PUSH32r ||
MI.getOpcode() == X86::PUSH64r))
return nullptr;
unsigned NumOps = MI.getDesc().getNumOperands();
bool isTwoAddr =
NumOps > 1 && MI.getDesc().getOperandConstraint(1, MCOI::TIED_TO) != -1;
// FIXME: AsmPrinter doesn't know how to handle
// X86II::MO_GOT_ABSOLUTE_ADDRESS after folding.
if (MI.getOpcode() == X86::ADD32ri &&
MI.getOperand(2).getTargetFlags() == X86II::MO_GOT_ABSOLUTE_ADDRESS)
return nullptr;
MachineInstr *NewMI = nullptr;
// Attempt to fold any custom cases we have.
if (MachineInstr *CustomMI =
foldMemoryOperandCustom(MF, MI, OpNum, MOs, InsertPt, Size, Align))
return CustomMI;
// Folding a memory location into the two-address part of a two-address
// instruction is different than folding it other places. It requires
// replacing the *two* registers with the memory location.
if (isTwoAddr && NumOps >= 2 && OpNum < 2 && MI.getOperand(0).isReg() &&
MI.getOperand(1).isReg() &&
MI.getOperand(0).getReg() == MI.getOperand(1).getReg()) {
OpcodeTablePtr = &RegOp2MemOpTable2Addr;
isTwoAddrFold = true;
} else if (OpNum == 0) {
if (MI.getOpcode() == X86::MOV32r0) {
NewMI = MakeM0Inst(*this, X86::MOV32mi, MOs, InsertPt, MI);
if (NewMI)
return NewMI;
}
OpcodeTablePtr = &RegOp2MemOpTable0;
} else if (OpNum == 1) {
OpcodeTablePtr = &RegOp2MemOpTable1;
} else if (OpNum == 2) {
OpcodeTablePtr = &RegOp2MemOpTable2;
} else if (OpNum == 3) {
OpcodeTablePtr = &RegOp2MemOpTable3;
} else if (OpNum == 4) {
OpcodeTablePtr = &RegOp2MemOpTable4;
}
// If table selected...
if (OpcodeTablePtr) {
// Find the Opcode to fuse
auto I = OpcodeTablePtr->find(MI.getOpcode());
if (I != OpcodeTablePtr->end()) {
unsigned Opcode = I->second.first;
unsigned MinAlign = (I->second.second & TB_ALIGN_MASK) >> TB_ALIGN_SHIFT;
if (Align < MinAlign)
return nullptr;
bool NarrowToMOV32rm = false;
if (Size) {
unsigned RCSize = getRegClass(MI.getDesc(), OpNum, &RI, MF)->getSize();
if (Size < RCSize) {
// Check if it's safe to fold the load. If the size of the object is
// narrower than the load width, then it's not.
if (Opcode != X86::MOV64rm || RCSize != 8 || Size != 4)
return nullptr;
// If this is a 64-bit load, but the spill slot is 32, then we can do
// a 32-bit load which is implicitly zero-extended. This likely is
// due to live interval analysis remat'ing a load from stack slot.
if (MI.getOperand(0).getSubReg() || MI.getOperand(1).getSubReg())
return nullptr;
Opcode = X86::MOV32rm;
NarrowToMOV32rm = true;
}
}
if (isTwoAddrFold)
NewMI = FuseTwoAddrInst(MF, Opcode, MOs, InsertPt, MI, *this);
else
NewMI = FuseInst(MF, Opcode, OpNum, MOs, InsertPt, MI, *this);
if (NarrowToMOV32rm) {
// If this is the special case where we use a MOV32rm to load a 32-bit
// value and zero-extend the top bits. Change the destination register
// to a 32-bit one.
unsigned DstReg = NewMI->getOperand(0).getReg();
if (TargetRegisterInfo::isPhysicalRegister(DstReg))
NewMI->getOperand(0).setReg(RI.getSubReg(DstReg, X86::sub_32bit));
else
NewMI->getOperand(0).setSubReg(X86::sub_32bit);
}
return NewMI;
}
}
// If the instruction and target operand are commutable, commute the
// instruction and try again.
if (AllowCommute) {
unsigned CommuteOpIdx1 = OpNum, CommuteOpIdx2 = CommuteAnyOperandIndex;
if (findCommutedOpIndices(MI, CommuteOpIdx1, CommuteOpIdx2)) {
bool HasDef = MI.getDesc().getNumDefs();
unsigned Reg0 = HasDef ? MI.getOperand(0).getReg() : 0;
unsigned Reg1 = MI.getOperand(CommuteOpIdx1).getReg();
unsigned Reg2 = MI.getOperand(CommuteOpIdx2).getReg();
bool Tied1 =
0 == MI.getDesc().getOperandConstraint(CommuteOpIdx1, MCOI::TIED_TO);
bool Tied2 =
0 == MI.getDesc().getOperandConstraint(CommuteOpIdx2, MCOI::TIED_TO);
// If either of the commutable operands are tied to the destination
// then we can not commute + fold.
if ((HasDef && Reg0 == Reg1 && Tied1) ||
(HasDef && Reg0 == Reg2 && Tied2))
return nullptr;
MachineInstr *CommutedMI =
commuteInstruction(MI, false, CommuteOpIdx1, CommuteOpIdx2);
if (!CommutedMI) {
// Unable to commute.
return nullptr;
}
if (CommutedMI != &MI) {
// New instruction. We can't fold from this.
CommutedMI->eraseFromParent();
return nullptr;
}
// Attempt to fold with the commuted version of the instruction.
NewMI = foldMemoryOperandImpl(MF, MI, CommuteOpIdx2, MOs, InsertPt,
Size, Align, /*AllowCommute=*/false);
if (NewMI)
return NewMI;
// Folding failed again - undo the commute before returning.
MachineInstr *UncommutedMI =
commuteInstruction(MI, false, CommuteOpIdx1, CommuteOpIdx2);
if (!UncommutedMI) {
// Unable to commute.
return nullptr;
}
if (UncommutedMI != &MI) {
// New instruction. It doesn't need to be kept.
UncommutedMI->eraseFromParent();
return nullptr;
}
// Return here to prevent duplicate fuse failure report.
return nullptr;
}
}
// No fusion
if (PrintFailedFusing && !MI.isCopy())
dbgs() << "We failed to fuse operand " << OpNum << " in " << MI;
return nullptr;
}
/// Return true for all instructions that only update
/// the first 32 or 64-bits of the destination register and leave the rest
/// unmodified. This can be used to avoid folding loads if the instructions
/// only update part of the destination register, and the non-updated part is
/// not needed. e.g. cvtss2sd, sqrtss. Unfolding the load from these
/// instructions breaks the partial register dependency and it can improve
/// performance. e.g.:
///
/// movss (%rdi), %xmm0
/// cvtss2sd %xmm0, %xmm0
///
/// Instead of
/// cvtss2sd (%rdi), %xmm0
///
/// FIXME: This should be turned into a TSFlags.
///
static bool hasPartialRegUpdate(unsigned Opcode) {
switch (Opcode) {
case X86::CVTSI2SSrr:
case X86::CVTSI2SSrm:
case X86::CVTSI2SS64rr:
case X86::CVTSI2SS64rm:
case X86::CVTSI2SDrr:
case X86::CVTSI2SDrm:
case X86::CVTSI2SD64rr:
case X86::CVTSI2SD64rm:
case X86::CVTSD2SSrr:
case X86::CVTSD2SSrm:
case X86::CVTSS2SDrr:
case X86::CVTSS2SDrm:
case X86::MOVHPDrm:
case X86::MOVHPSrm:
case X86::MOVLPDrm:
case X86::MOVLPSrm:
case X86::RCPSSr:
case X86::RCPSSm:
case X86::RCPSSr_Int:
case X86::RCPSSm_Int:
case X86::ROUNDSDr:
case X86::ROUNDSDm:
case X86::ROUNDSSr:
case X86::ROUNDSSm:
case X86::RSQRTSSr:
case X86::RSQRTSSm:
case X86::RSQRTSSr_Int:
case X86::RSQRTSSm_Int:
case X86::SQRTSSr:
case X86::SQRTSSm:
case X86::SQRTSSr_Int:
case X86::SQRTSSm_Int:
case X86::SQRTSDr:
case X86::SQRTSDm:
case X86::SQRTSDr_Int:
case X86::SQRTSDm_Int:
return true;
}
return false;
}
/// Inform the ExeDepsFix pass how many idle
/// instructions we would like before a partial register update.
unsigned X86InstrInfo::getPartialRegUpdateClearance(
const MachineInstr &MI, unsigned OpNum,
const TargetRegisterInfo *TRI) const {
if (OpNum != 0 || !hasPartialRegUpdate(MI.getOpcode()))
return 0;
// If MI is marked as reading Reg, the partial register update is wanted.
const MachineOperand &MO = MI.getOperand(0);
unsigned Reg = MO.getReg();
if (TargetRegisterInfo::isVirtualRegister(Reg)) {
if (MO.readsReg() || MI.readsVirtualRegister(Reg))
return 0;
} else {
if (MI.readsRegister(Reg, TRI))
return 0;
}
// If any instructions in the clearance range are reading Reg, insert a
// dependency breaking instruction, which is inexpensive and is likely to
// be hidden in other instruction's cycles.
return PartialRegUpdateClearance;
}
// Return true for any instruction the copies the high bits of the first source
// operand into the unused high bits of the destination operand.
static bool hasUndefRegUpdate(unsigned Opcode) {
switch (Opcode) {
case X86::VCVTSI2SSrr:
case X86::VCVTSI2SSrm:
case X86::Int_VCVTSI2SSrr:
case X86::Int_VCVTSI2SSrm:
case X86::VCVTSI2SS64rr:
case X86::VCVTSI2SS64rm:
case X86::Int_VCVTSI2SS64rr:
case X86::Int_VCVTSI2SS64rm:
case X86::VCVTSI2SDrr:
case X86::VCVTSI2SDrm:
case X86::Int_VCVTSI2SDrr:
case X86::Int_VCVTSI2SDrm:
case X86::VCVTSI2SD64rr:
case X86::VCVTSI2SD64rm:
case X86::Int_VCVTSI2SD64rr:
case X86::Int_VCVTSI2SD64rm:
case X86::VCVTSD2SSrr:
case X86::VCVTSD2SSrm:
case X86::Int_VCVTSD2SSrr:
case X86::Int_VCVTSD2SSrm:
case X86::VCVTSS2SDrr:
case X86::VCVTSS2SDrm:
case X86::Int_VCVTSS2SDrr:
case X86::Int_VCVTSS2SDrm:
case X86::VRCPSSr:
case X86::VRCPSSr_Int:
case X86::VRCPSSm:
case X86::VRCPSSm_Int:
case X86::VROUNDSDr:
case X86::VROUNDSDm:
case X86::VROUNDSDr_Int:
case X86::VROUNDSDm_Int:
case X86::VROUNDSSr:
case X86::VROUNDSSm:
case X86::VROUNDSSr_Int:
case X86::VROUNDSSm_Int:
case X86::VRSQRTSSr:
case X86::VRSQRTSSr_Int:
case X86::VRSQRTSSm:
case X86::VRSQRTSSm_Int:
case X86::VSQRTSSr:
case X86::VSQRTSSr_Int:
case X86::VSQRTSSm:
case X86::VSQRTSSm_Int:
case X86::VSQRTSDr:
case X86::VSQRTSDr_Int:
case X86::VSQRTSDm:
case X86::VSQRTSDm_Int:
// AVX-512
case X86::VCVTSI2SSZrr:
case X86::VCVTSI2SSZrm:
case X86::VCVTSI2SSZrr_Int:
case X86::VCVTSI2SSZrrb_Int:
case X86::VCVTSI2SSZrm_Int:
case X86::VCVTSI642SSZrr:
case X86::VCVTSI642SSZrm:
case X86::VCVTSI642SSZrr_Int:
case X86::VCVTSI642SSZrrb_Int:
case X86::VCVTSI642SSZrm_Int:
case X86::VCVTSI2SDZrr:
case X86::VCVTSI2SDZrm:
case X86::VCVTSI2SDZrr_Int:
case X86::VCVTSI2SDZrrb_Int:
case X86::VCVTSI2SDZrm_Int:
case X86::VCVTSI642SDZrr:
case X86::VCVTSI642SDZrm:
case X86::VCVTSI642SDZrr_Int:
case X86::VCVTSI642SDZrrb_Int:
case X86::VCVTSI642SDZrm_Int:
case X86::VCVTUSI2SSZrr:
case X86::VCVTUSI2SSZrm:
case X86::VCVTUSI2SSZrr_Int:
case X86::VCVTUSI2SSZrrb_Int:
case X86::VCVTUSI2SSZrm_Int:
case X86::VCVTUSI642SSZrr:
case X86::VCVTUSI642SSZrm:
case X86::VCVTUSI642SSZrr_Int:
case X86::VCVTUSI642SSZrrb_Int:
case X86::VCVTUSI642SSZrm_Int:
case X86::VCVTUSI2SDZrr:
case X86::VCVTUSI2SDZrm:
case X86::VCVTUSI2SDZrr_Int:
case X86::VCVTUSI2SDZrm_Int:
case X86::VCVTUSI642SDZrr:
case X86::VCVTUSI642SDZrm:
case X86::VCVTUSI642SDZrr_Int:
case X86::VCVTUSI642SDZrrb_Int:
case X86::VCVTUSI642SDZrm_Int:
case X86::VCVTSD2SSZrr:
case X86::VCVTSD2SSZrrb:
case X86::VCVTSD2SSZrm:
case X86::VCVTSS2SDZrr:
case X86::VCVTSS2SDZrrb:
case X86::VCVTSS2SDZrm:
case X86::VRNDSCALESDr:
case X86::VRNDSCALESDrb:
case X86::VRNDSCALESDm:
case X86::VRNDSCALESSr:
case X86::VRNDSCALESSrb:
case X86::VRNDSCALESSm:
case X86::VRCP14SSrr:
case X86::VRCP14SSrm:
case X86::VRSQRT14SSrr:
case X86::VRSQRT14SSrm:
case X86::VSQRTSSZr:
case X86::VSQRTSSZr_Int:
case X86::VSQRTSSZrb_Int:
case X86::VSQRTSSZm:
case X86::VSQRTSSZm_Int:
case X86::VSQRTSDZr:
case X86::VSQRTSDZr_Int:
case X86::VSQRTSDZrb_Int:
case X86::VSQRTSDZm:
case X86::VSQRTSDZm_Int:
return true;
}
return false;
}
/// Inform the ExeDepsFix pass how many idle instructions we would like before
/// certain undef register reads.
///
/// This catches the VCVTSI2SD family of instructions:
///
/// vcvtsi2sdq %rax, %xmm0<undef>, %xmm14
///
/// We should to be careful *not* to catch VXOR idioms which are presumably
/// handled specially in the pipeline:
///
/// vxorps %xmm1<undef>, %xmm1<undef>, %xmm1
///
/// Like getPartialRegUpdateClearance, this makes a strong assumption that the
/// high bits that are passed-through are not live.
unsigned
X86InstrInfo::getUndefRegClearance(const MachineInstr &MI, unsigned &OpNum,
const TargetRegisterInfo *TRI) const {
if (!hasUndefRegUpdate(MI.getOpcode()))
return 0;
// Set the OpNum parameter to the first source operand.
OpNum = 1;
const MachineOperand &MO = MI.getOperand(OpNum);
if (MO.isUndef() && TargetRegisterInfo::isPhysicalRegister(MO.getReg())) {
return UndefRegClearance;
}
return 0;
}
void X86InstrInfo::breakPartialRegDependency(
MachineInstr &MI, unsigned OpNum, const TargetRegisterInfo *TRI) const {
unsigned Reg = MI.getOperand(OpNum).getReg();
// If MI kills this register, the false dependence is already broken.
if (MI.killsRegister(Reg, TRI))
return;
if (X86::VR128RegClass.contains(Reg)) {
// These instructions are all floating point domain, so xorps is the best
// choice.
unsigned Opc = Subtarget.hasAVX() ? X86::VXORPSrr : X86::XORPSrr;
BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), get(Opc), Reg)
.addReg(Reg, RegState::Undef)
.addReg(Reg, RegState::Undef);
MI.addRegisterKilled(Reg, TRI, true);
} else if (X86::VR256RegClass.contains(Reg)) {
// Use vxorps to clear the full ymm register.
// It wants to read and write the xmm sub-register.
unsigned XReg = TRI->getSubReg(Reg, X86::sub_xmm);
BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), get(X86::VXORPSrr), XReg)
.addReg(XReg, RegState::Undef)
.addReg(XReg, RegState::Undef)
.addReg(Reg, RegState::ImplicitDefine);
MI.addRegisterKilled(Reg, TRI, true);
}
}
MachineInstr *
X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, MachineInstr &MI,
ArrayRef<unsigned> Ops,
MachineBasicBlock::iterator InsertPt,
int FrameIndex, LiveIntervals *LIS) const {
// Check switch flag
if (NoFusing)
return nullptr;
// Unless optimizing for size, don't fold to avoid partial
// register update stalls
if (!MF.getFunction()->optForSize() && hasPartialRegUpdate(MI.getOpcode()))
return nullptr;
// Don't fold subreg spills, or reloads that use a high subreg.
for (auto Op : Ops) {
MachineOperand &MO = MI.getOperand(Op);
auto SubReg = MO.getSubReg();
if (SubReg && (MO.isDef() || SubReg == X86::sub_8bit_hi))
return nullptr;
}
const MachineFrameInfo &MFI = MF.getFrameInfo();
unsigned Size = MFI.getObjectSize(FrameIndex);
unsigned Alignment = MFI.getObjectAlignment(FrameIndex);
// If the function stack isn't realigned we don't want to fold instructions
// that need increased alignment.
if (!RI.needsStackRealignment(MF))
Alignment =
std::min(Alignment, Subtarget.getFrameLowering()->getStackAlignment());
if (Ops.size() == 2 && Ops[0] == 0 && Ops[1] == 1) {
unsigned NewOpc = 0;
unsigned RCSize = 0;
switch (MI.getOpcode()) {
default: return nullptr;
case X86::TEST8rr: NewOpc = X86::CMP8ri; RCSize = 1; break;
case X86::TEST16rr: NewOpc = X86::CMP16ri8; RCSize = 2; break;
case X86::TEST32rr: NewOpc = X86::CMP32ri8; RCSize = 4; break;
case X86::TEST64rr: NewOpc = X86::CMP64ri8; RCSize = 8; break;
}
// Check if it's safe to fold the load. If the size of the object is
// narrower than the load width, then it's not.
if (Size < RCSize)
return nullptr;
// Change to CMPXXri r, 0 first.
MI.setDesc(get(NewOpc));
MI.getOperand(1).ChangeToImmediate(0);
} else if (Ops.size() != 1)
return nullptr;
return foldMemoryOperandImpl(MF, MI, Ops[0],
MachineOperand::CreateFI(FrameIndex), InsertPt,
Size, Alignment, /*AllowCommute=*/true);
}
/// Check if \p LoadMI is a partial register load that we can't fold into \p MI
/// because the latter uses contents that wouldn't be defined in the folded
/// version. For instance, this transformation isn't legal:
/// movss (%rdi), %xmm0
/// addps %xmm0, %xmm0
/// ->
/// addps (%rdi), %xmm0
///
/// But this one is:
/// movss (%rdi), %xmm0
/// addss %xmm0, %xmm0
/// ->
/// addss (%rdi), %xmm0
///
static bool isNonFoldablePartialRegisterLoad(const MachineInstr &LoadMI,
const MachineInstr &UserMI,
const MachineFunction &MF) {
unsigned Opc = LoadMI.getOpcode();
unsigned UserOpc = UserMI.getOpcode();
unsigned RegSize =
MF.getRegInfo().getRegClass(LoadMI.getOperand(0).getReg())->getSize();
if ((Opc == X86::MOVSSrm || Opc == X86::VMOVSSrm || Opc == X86::VMOVSSZrm) &&
RegSize > 4) {
// These instructions only load 32 bits, we can't fold them if the
// destination register is wider than 32 bits (4 bytes), and its user
// instruction isn't scalar (SS).
switch (UserOpc) {
case X86::ADDSSrr_Int: case X86::VADDSSrr_Int: case X86::VADDSSZrr_Int:
case X86::Int_CMPSSrr: case X86::Int_VCMPSSrr: case X86::VCMPSSZrr_Int:
case X86::DIVSSrr_Int: case X86::VDIVSSrr_Int: case X86::VDIVSSZrr_Int:
case X86::MAXSSrr_Int: case X86::VMAXSSrr_Int: case X86::VMAXSSZrr_Int:
case X86::MINSSrr_Int: case X86::VMINSSrr_Int: case X86::VMINSSZrr_Int:
case X86::MULSSrr_Int: case X86::VMULSSrr_Int: case X86::VMULSSZrr_Int:
case X86::SUBSSrr_Int: case X86::VSUBSSrr_Int: case X86::VSUBSSZrr_Int:
case X86::VFMADDSS4rr_Int: case X86::VFNMADDSS4rr_Int:
case X86::VFMSUBSS4rr_Int: case X86::VFNMSUBSS4rr_Int:
case X86::VFMADD132SSr_Int: case X86::VFNMADD132SSr_Int:
case X86::VFMADD213SSr_Int: case X86::VFNMADD213SSr_Int:
case X86::VFMADD231SSr_Int: case X86::VFNMADD231SSr_Int:
case X86::VFMSUB132SSr_Int: case X86::VFNMSUB132SSr_Int:
case X86::VFMSUB213SSr_Int: case X86::VFNMSUB213SSr_Int:
case X86::VFMSUB231SSr_Int: case X86::VFNMSUB231SSr_Int:
case X86::VFMADD132SSZr_Int: case X86::VFNMADD132SSZr_Int:
case X86::VFMADD213SSZr_Int: case X86::VFNMADD213SSZr_Int:
case X86::VFMADD231SSZr_Int: case X86::VFNMADD231SSZr_Int:
case X86::VFMSUB132SSZr_Int: case X86::VFNMSUB132SSZr_Int:
case X86::VFMSUB213SSZr_Int: case X86::VFNMSUB213SSZr_Int:
case X86::VFMSUB231SSZr_Int: case X86::VFNMSUB231SSZr_Int:
return false;
default:
return true;
}
}
if ((Opc == X86::MOVSDrm || Opc == X86::VMOVSDrm || Opc == X86::VMOVSDZrm) &&
RegSize > 8) {
// These instructions only load 64 bits, we can't fold them if the
// destination register is wider than 64 bits (8 bytes), and its user
// instruction isn't scalar (SD).
switch (UserOpc) {
case X86::ADDSDrr_Int: case X86::VADDSDrr_Int: case X86::VADDSDZrr_Int:
case X86::Int_CMPSDrr: case X86::Int_VCMPSDrr: case X86::VCMPSDZrr_Int:
case X86::DIVSDrr_Int: case X86::VDIVSDrr_Int: case X86::VDIVSDZrr_Int:
case X86::MAXSDrr_Int: case X86::VMAXSDrr_Int: case X86::VMAXSDZrr_Int:
case X86::MINSDrr_Int: case X86::VMINSDrr_Int: case X86::VMINSDZrr_Int:
case X86::MULSDrr_Int: case X86::VMULSDrr_Int: case X86::VMULSDZrr_Int:
case X86::SUBSDrr_Int: case X86::VSUBSDrr_Int: case X86::VSUBSDZrr_Int:
case X86::VFMADDSD4rr_Int: case X86::VFNMADDSD4rr_Int:
case X86::VFMSUBSD4rr_Int: case X86::VFNMSUBSD4rr_Int:
case X86::VFMADD132SDr_Int: case X86::VFNMADD132SDr_Int:
case X86::VFMADD213SDr_Int: case X86::VFNMADD213SDr_Int:
case X86::VFMADD231SDr_Int: case X86::VFNMADD231SDr_Int:
case X86::VFMSUB132SDr_Int: case X86::VFNMSUB132SDr_Int:
case X86::VFMSUB213SDr_Int: case X86::VFNMSUB213SDr_Int:
case X86::VFMSUB231SDr_Int: case X86::VFNMSUB231SDr_Int:
case X86::VFMADD132SDZr_Int: case X86::VFNMADD132SDZr_Int:
case X86::VFMADD213SDZr_Int: case X86::VFNMADD213SDZr_Int:
case X86::VFMADD231SDZr_Int: case X86::VFNMADD231SDZr_Int:
case X86::VFMSUB132SDZr_Int: case X86::VFNMSUB132SDZr_Int:
case X86::VFMSUB213SDZr_Int: case X86::VFNMSUB213SDZr_Int:
case X86::VFMSUB231SDZr_Int: case X86::VFNMSUB231SDZr_Int:
return false;
default:
return true;
}
}
return false;
}
MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
MachineFunction &MF, MachineInstr &MI, ArrayRef<unsigned> Ops,
MachineBasicBlock::iterator InsertPt, MachineInstr &LoadMI,
LiveIntervals *LIS) const {
// TODO: Support the case where LoadMI loads a wide register, but MI
// only uses a subreg.
for (auto Op : Ops) {
if (MI.getOperand(Op).getSubReg())
return nullptr;
}
// If loading from a FrameIndex, fold directly from the FrameIndex.
unsigned NumOps = LoadMI.getDesc().getNumOperands();
int FrameIndex;
if (isLoadFromStackSlot(LoadMI, FrameIndex)) {
if (isNonFoldablePartialRegisterLoad(LoadMI, MI, MF))
return nullptr;
return foldMemoryOperandImpl(MF, MI, Ops, InsertPt, FrameIndex, LIS);
}
// Check switch flag
if (NoFusing) return nullptr;
// Avoid partial register update stalls unless optimizing for size.
if (!MF.getFunction()->optForSize() && hasPartialRegUpdate(MI.getOpcode()))
return nullptr;
// Determine the alignment of the load.
unsigned Alignment = 0;
if (LoadMI.hasOneMemOperand())
Alignment = (*LoadMI.memoperands_begin())->getAlignment();
else
switch (LoadMI.getOpcode()) {
case X86::AVX512_512_SET0:
case X86::AVX512_512_SETALLONES:
Alignment = 64;
break;
case X86::AVX2_SETALLONES:
case X86::AVX_SET0:
case X86::AVX512_256_SET0:
Alignment = 32;
break;
case X86::V_SET0:
case X86::V_SETALLONES:
case X86::AVX512_128_SET0:
Alignment = 16;
break;
case X86::FsFLD0SD:
case X86::AVX512_FsFLD0SD:
Alignment = 8;
break;
case X86::FsFLD0SS:
case X86::AVX512_FsFLD0SS:
Alignment = 4;
break;
default:
return nullptr;
}
if (Ops.size() == 2 && Ops[0] == 0 && Ops[1] == 1) {
unsigned NewOpc = 0;
switch (MI.getOpcode()) {
default: return nullptr;
case X86::TEST8rr: NewOpc = X86::CMP8ri; break;
case X86::TEST16rr: NewOpc = X86::CMP16ri8; break;
case X86::TEST32rr: NewOpc = X86::CMP32ri8; break;
case X86::TEST64rr: NewOpc = X86::CMP64ri8; break;
}
// Change to CMPXXri r, 0 first.
MI.setDesc(get(NewOpc));
MI.getOperand(1).ChangeToImmediate(0);
} else if (Ops.size() != 1)
return nullptr;
// Make sure the subregisters match.
// Otherwise we risk changing the size of the load.
if (LoadMI.getOperand(0).getSubReg() != MI.getOperand(Ops[0]).getSubReg())
return nullptr;
SmallVector<MachineOperand,X86::AddrNumOperands> MOs;
switch (LoadMI.getOpcode()) {
case X86::V_SET0:
case X86::V_SETALLONES:
case X86::AVX2_SETALLONES:
case X86::AVX_SET0:
case X86::AVX512_128_SET0:
case X86::AVX512_256_SET0:
case X86::AVX512_512_SET0:
case X86::AVX512_512_SETALLONES:
case X86::FsFLD0SD:
case X86::AVX512_FsFLD0SD:
case X86::FsFLD0SS:
case X86::AVX512_FsFLD0SS: {
// Folding a V_SET0 or V_SETALLONES as a load, to ease register pressure.
// Create a constant-pool entry and operands to load from it.
// Medium and large mode can't fold loads this way.
if (MF.getTarget().getCodeModel() != CodeModel::Small &&
MF.getTarget().getCodeModel() != CodeModel::Kernel)
return nullptr;
// x86-32 PIC requires a PIC base register for constant pools.
unsigned PICBase = 0;
if (MF.getTarget().isPositionIndependent()) {
if (Subtarget.is64Bit())
PICBase = X86::RIP;
else
// FIXME: PICBase = getGlobalBaseReg(&MF);
// This doesn't work for several reasons.
// 1. GlobalBaseReg may have been spilled.
// 2. It may not be live at MI.
return nullptr;
}
// Create a constant-pool entry.
MachineConstantPool &MCP = *MF.getConstantPool();
Type *Ty;
unsigned Opc = LoadMI.getOpcode();
if (Opc == X86::FsFLD0SS || Opc == X86::AVX512_FsFLD0SS)
Ty = Type::getFloatTy(MF.getFunction()->getContext());
else if (Opc == X86::FsFLD0SD || Opc == X86::AVX512_FsFLD0SD)
Ty = Type::getDoubleTy(MF.getFunction()->getContext());
else if (Opc == X86::AVX512_512_SET0 || Opc == X86::AVX512_512_SETALLONES)
Ty = VectorType::get(Type::getInt32Ty(MF.getFunction()->getContext()),16);
else if (Opc == X86::AVX2_SETALLONES || Opc == X86::AVX_SET0 ||
Opc == X86::AVX512_256_SET0)
Ty = VectorType::get(Type::getInt32Ty(MF.getFunction()->getContext()), 8);
else
Ty = VectorType::get(Type::getInt32Ty(MF.getFunction()->getContext()), 4);
bool IsAllOnes = (Opc == X86::V_SETALLONES || Opc == X86::AVX2_SETALLONES ||
Opc == X86::AVX512_512_SETALLONES);
const Constant *C = IsAllOnes ? Constant::getAllOnesValue(Ty) :
Constant::getNullValue(Ty);
unsigned CPI = MCP.getConstantPoolIndex(C, Alignment);
// Create operands to load from the constant pool entry.
MOs.push_back(MachineOperand::CreateReg(PICBase, false));
MOs.push_back(MachineOperand::CreateImm(1));
MOs.push_back(MachineOperand::CreateReg(0, false));
MOs.push_back(MachineOperand::CreateCPI(CPI, 0));
MOs.push_back(MachineOperand::CreateReg(0, false));
break;
}
default: {
if (isNonFoldablePartialRegisterLoad(LoadMI, MI, MF))
return nullptr;
// Folding a normal load. Just copy the load's address operands.
MOs.append(LoadMI.operands_begin() + NumOps - X86::AddrNumOperands,
LoadMI.operands_begin() + NumOps);
break;
}
}
return foldMemoryOperandImpl(MF, MI, Ops[0], MOs, InsertPt,
/*Size=*/0, Alignment, /*AllowCommute=*/true);
}
bool X86InstrInfo::unfoldMemoryOperand(
MachineFunction &MF, MachineInstr &MI, unsigned Reg, bool UnfoldLoad,
bool UnfoldStore, SmallVectorImpl<MachineInstr *> &NewMIs) const {
auto I = MemOp2RegOpTable.find(MI.getOpcode());
if (I == MemOp2RegOpTable.end())
return false;
unsigned Opc = I->second.first;
unsigned Index = I->second.second & TB_INDEX_MASK;
bool FoldedLoad = I->second.second & TB_FOLDED_LOAD;
bool FoldedStore = I->second.second & TB_FOLDED_STORE;
if (UnfoldLoad && !FoldedLoad)
return false;
UnfoldLoad &= FoldedLoad;
if (UnfoldStore && !FoldedStore)
return false;
UnfoldStore &= FoldedStore;
const MCInstrDesc &MCID = get(Opc);
const TargetRegisterClass *RC = getRegClass(MCID, Index, &RI, MF);
// TODO: Check if 32-byte or greater accesses are slow too?
if (!MI.hasOneMemOperand() && RC == &X86::VR128RegClass &&
Subtarget.isUnalignedMem16Slow())
// Without memoperands, loadRegFromAddr and storeRegToStackSlot will
// conservatively assume the address is unaligned. That's bad for
// performance.
return false;
SmallVector<MachineOperand, X86::AddrNumOperands> AddrOps;
SmallVector<MachineOperand,2> BeforeOps;
SmallVector<MachineOperand,2> AfterOps;
SmallVector<MachineOperand,4> ImpOps;
for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
MachineOperand &Op = MI.getOperand(i);
if (i >= Index && i < Index + X86::AddrNumOperands)
AddrOps.push_back(Op);
else if (Op.isReg() && Op.isImplicit())
ImpOps.push_back(Op);
else if (i < Index)
BeforeOps.push_back(Op);
else if (i > Index)
AfterOps.push_back(Op);
}
// Emit the load instruction.
if (UnfoldLoad) {
std::pair<MachineInstr::mmo_iterator, MachineInstr::mmo_iterator> MMOs =
MF.extractLoadMemRefs(MI.memoperands_begin(), MI.memoperands_end());
loadRegFromAddr(MF, Reg, AddrOps, RC, MMOs.first, MMOs.second, NewMIs);
if (UnfoldStore) {
// Address operands cannot be marked isKill.
for (unsigned i = 1; i != 1 + X86::AddrNumOperands; ++i) {
MachineOperand &MO = NewMIs[0]->getOperand(i);
if (MO.isReg())
MO.setIsKill(false);
}
}
}
// Emit the data processing instruction.
MachineInstr *DataMI = MF.CreateMachineInstr(MCID, MI.getDebugLoc(), true);
MachineInstrBuilder MIB(MF, DataMI);
if (FoldedStore)
MIB.addReg(Reg, RegState::Define);
for (MachineOperand &BeforeOp : BeforeOps)
MIB.addOperand(BeforeOp);
if (FoldedLoad)
MIB.addReg(Reg);
for (MachineOperand &AfterOp : AfterOps)
MIB.addOperand(AfterOp);
for (MachineOperand &ImpOp : ImpOps) {
MIB.addReg(ImpOp.getReg(),
getDefRegState(ImpOp.isDef()) |
RegState::Implicit |
getKillRegState(ImpOp.isKill()) |
getDeadRegState(ImpOp.isDead()) |
getUndefRegState(ImpOp.isUndef()));
}
// Change CMP32ri r, 0 back to TEST32rr r, r, etc.
switch (DataMI->getOpcode()) {
default: break;
case X86::CMP64ri32:
case X86::CMP64ri8:
case X86::CMP32ri:
case X86::CMP32ri8:
case X86::CMP16ri:
case X86::CMP16ri8:
case X86::CMP8ri: {
MachineOperand &MO0 = DataMI->getOperand(0);
MachineOperand &MO1 = DataMI->getOperand(1);
if (MO1.getImm() == 0) {
unsigned NewOpc;
switch (DataMI->getOpcode()) {
default: llvm_unreachable("Unreachable!");
case X86::CMP64ri8:
case X86::CMP64ri32: NewOpc = X86::TEST64rr; break;
case X86::CMP32ri8:
case X86::CMP32ri: NewOpc = X86::TEST32rr; break;
case X86::CMP16ri8:
case X86::CMP16ri: NewOpc = X86::TEST16rr; break;
case X86::CMP8ri: NewOpc = X86::TEST8rr; break;
}
DataMI->setDesc(get(NewOpc));
MO1.ChangeToRegister(MO0.getReg(), false);
}
}
}
NewMIs.push_back(DataMI);
// Emit the store instruction.
if (UnfoldStore) {
const TargetRegisterClass *DstRC = getRegClass(MCID, 0, &RI, MF);
std::pair<MachineInstr::mmo_iterator, MachineInstr::mmo_iterator> MMOs =
MF.extractStoreMemRefs(MI.memoperands_begin(), MI.memoperands_end());
storeRegToAddr(MF, Reg, true, AddrOps, DstRC, MMOs.first, MMOs.second, NewMIs);
}
return true;
}
bool
X86InstrInfo::unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N,
SmallVectorImpl<SDNode*> &NewNodes) const {
if (!N->isMachineOpcode())
return false;
auto I = MemOp2RegOpTable.find(N->getMachineOpcode());
if (I == MemOp2RegOpTable.end())
return false;
unsigned Opc = I->second.first;
unsigned Index = I->second.second & TB_INDEX_MASK;
bool FoldedLoad = I->second.second & TB_FOLDED_LOAD;
bool FoldedStore = I->second.second & TB_FOLDED_STORE;
const MCInstrDesc &MCID = get(Opc);
MachineFunction &MF = DAG.getMachineFunction();
const TargetRegisterClass *RC = getRegClass(MCID, Index, &RI, MF);
unsigned NumDefs = MCID.NumDefs;
std::vector<SDValue> AddrOps;
std::vector<SDValue> BeforeOps;
std::vector<SDValue> AfterOps;
SDLoc dl(N);
unsigned NumOps = N->getNumOperands();
for (unsigned i = 0; i != NumOps-1; ++i) {
SDValue Op = N->getOperand(i);
if (i >= Index-NumDefs && i < Index-NumDefs + X86::AddrNumOperands)
AddrOps.push_back(Op);
else if (i < Index-NumDefs)
BeforeOps.push_back(Op);
else if (i > Index-NumDefs)
AfterOps.push_back(Op);
}
SDValue Chain = N->getOperand(NumOps-1);
AddrOps.push_back(Chain);
// Emit the load instruction.
SDNode *Load = nullptr;
if (FoldedLoad) {
EVT VT = *RC->vt_begin();
std::pair<MachineInstr::mmo_iterator,
MachineInstr::mmo_iterator> MMOs =
MF.extractLoadMemRefs(cast<MachineSDNode>(N)->memoperands_begin(),
cast<MachineSDNode>(N)->memoperands_end());
if (!(*MMOs.first) &&
RC == &X86::VR128RegClass &&
Subtarget.isUnalignedMem16Slow())
// Do not introduce a slow unaligned load.
return false;
// FIXME: If a VR128 can have size 32, we should be checking if a 32-byte
// memory access is slow above.
unsigned Alignment = std::max<uint32_t>(RC->getSize(), 16);
bool isAligned = (*MMOs.first) &&
(*MMOs.first)->getAlignment() >= Alignment;
Load = DAG.getMachineNode(getLoadRegOpcode(0, RC, isAligned, Subtarget), dl,
VT, MVT::Other, AddrOps);
NewNodes.push_back(Load);
// Preserve memory reference information.
cast<MachineSDNode>(Load)->setMemRefs(MMOs.first, MMOs.second);
}
// Emit the data processing instruction.
std::vector<EVT> VTs;
const TargetRegisterClass *DstRC = nullptr;
if (MCID.getNumDefs() > 0) {
DstRC = getRegClass(MCID, 0, &RI, MF);
VTs.push_back(*DstRC->vt_begin());
}
for (unsigned i = 0, e = N->getNumValues(); i != e; ++i) {
EVT VT = N->getValueType(i);
if (VT != MVT::Other && i >= (unsigned)MCID.getNumDefs())
VTs.push_back(VT);
}
if (Load)
BeforeOps.push_back(SDValue(Load, 0));
BeforeOps.insert(BeforeOps.end(), AfterOps.begin(), AfterOps.end());
SDNode *NewNode= DAG.getMachineNode(Opc, dl, VTs, BeforeOps);
NewNodes.push_back(NewNode);
// Emit the store instruction.
if (FoldedStore) {
AddrOps.pop_back();
AddrOps.push_back(SDValue(NewNode, 0));
AddrOps.push_back(Chain);
std::pair<MachineInstr::mmo_iterator,
MachineInstr::mmo_iterator> MMOs =
MF.extractStoreMemRefs(cast<MachineSDNode>(N)->memoperands_begin(),
cast<MachineSDNode>(N)->memoperands_end());
if (!(*MMOs.first) &&
RC == &X86::VR128RegClass &&
Subtarget.isUnalignedMem16Slow())
// Do not introduce a slow unaligned store.
return false;
// FIXME: If a VR128 can have size 32, we should be checking if a 32-byte
// memory access is slow above.
unsigned Alignment = std::max<uint32_t>(RC->getSize(), 16);
bool isAligned = (*MMOs.first) &&
(*MMOs.first)->getAlignment() >= Alignment;
SDNode *Store =
DAG.getMachineNode(getStoreRegOpcode(0, DstRC, isAligned, Subtarget),
dl, MVT::Other, AddrOps);
NewNodes.push_back(Store);
// Preserve memory reference information.
cast<MachineSDNode>(Store)->setMemRefs(MMOs.first, MMOs.second);
}
return true;
}
unsigned X86InstrInfo::getOpcodeAfterMemoryUnfold(unsigned Opc,
bool UnfoldLoad, bool UnfoldStore,
unsigned *LoadRegIndex) const {
auto I = MemOp2RegOpTable.find(Opc);
if (I == MemOp2RegOpTable.end())
return 0;
bool FoldedLoad = I->second.second & TB_FOLDED_LOAD;
bool FoldedStore = I->second.second & TB_FOLDED_STORE;
if (UnfoldLoad && !FoldedLoad)
return 0;
if (UnfoldStore && !FoldedStore)
return 0;
if (LoadRegIndex)
*LoadRegIndex = I->second.second & TB_INDEX_MASK;
return I->second.first;
}
bool
X86InstrInfo::areLoadsFromSameBasePtr(SDNode *Load1, SDNode *Load2,
int64_t &Offset1, int64_t &Offset2) const {
if (!Load1->isMachineOpcode() || !Load2->isMachineOpcode())
return false;
unsigned Opc1 = Load1->getMachineOpcode();
unsigned Opc2 = Load2->getMachineOpcode();
switch (Opc1) {
default: return false;
case X86::MOV8rm:
case X86::MOV16rm:
case X86::MOV32rm:
case X86::MOV64rm:
case X86::LD_Fp32m:
case X86::LD_Fp64m:
case X86::LD_Fp80m:
case X86::MOVSSrm:
case X86::MOVSDrm:
case X86::MMX_MOVD64rm:
case X86::MMX_MOVQ64rm:
case X86::MOVAPSrm:
case X86::MOVUPSrm:
case X86::MOVAPDrm:
case X86::MOVUPDrm:
case X86::MOVDQArm:
case X86::MOVDQUrm:
// AVX load instructions
case X86::VMOVSSrm:
case X86::VMOVSDrm:
case X86::VMOVAPSrm:
case X86::VMOVUPSrm:
case X86::VMOVAPDrm:
case X86::VMOVUPDrm:
case X86::VMOVDQArm:
case X86::VMOVDQUrm:
case X86::VMOVAPSYrm:
case X86::VMOVUPSYrm:
case X86::VMOVAPDYrm:
case X86::VMOVUPDYrm:
case X86::VMOVDQAYrm:
case X86::VMOVDQUYrm:
// AVX512 load instructions
case X86::VMOVSSZrm:
case X86::VMOVSDZrm:
case X86::VMOVAPSZ128rm:
case X86::VMOVUPSZ128rm:
case X86::VMOVAPSZ128rm_NOVLX:
case X86::VMOVUPSZ128rm_NOVLX:
case X86::VMOVAPDZ128rm:
case X86::VMOVUPDZ128rm:
case X86::VMOVDQU8Z128rm:
case X86::VMOVDQU16Z128rm:
case X86::VMOVDQA32Z128rm:
case X86::VMOVDQU32Z128rm:
case X86::VMOVDQA64Z128rm:
case X86::VMOVDQU64Z128rm:
case X86::VMOVAPSZ256rm:
case X86::VMOVUPSZ256rm:
case X86::VMOVAPSZ256rm_NOVLX:
case X86::VMOVUPSZ256rm_NOVLX:
case X86::VMOVAPDZ256rm:
case X86::VMOVUPDZ256rm:
case X86::VMOVDQU8Z256rm:
case X86::VMOVDQU16Z256rm:
case X86::VMOVDQA32Z256rm:
case X86::VMOVDQU32Z256rm:
case X86::VMOVDQA64Z256rm:
case X86::VMOVDQU64Z256rm:
case X86::VMOVAPSZrm:
case X86::VMOVUPSZrm:
case X86::VMOVAPDZrm:
case X86::VMOVUPDZrm:
case X86::VMOVDQU8Zrm:
case X86::VMOVDQU16Zrm:
case X86::VMOVDQA32Zrm:
case X86::VMOVDQU32Zrm:
case X86::VMOVDQA64Zrm:
case X86::VMOVDQU64Zrm:
case X86::KMOVBkm:
case X86::KMOVWkm:
case X86::KMOVDkm:
case X86::KMOVQkm:
break;
}
switch (Opc2) {
default: return false;
case X86::MOV8rm:
case X86::MOV16rm:
case X86::MOV32rm:
case X86::MOV64rm:
case X86::LD_Fp32m:
case X86::LD_Fp64m:
case X86::LD_Fp80m:
case X86::MOVSSrm:
case X86::MOVSDrm:
case X86::MMX_MOVD64rm:
case X86::MMX_MOVQ64rm:
case X86::MOVAPSrm:
case X86::MOVUPSrm:
case X86::MOVAPDrm:
case X86::MOVUPDrm:
case X86::MOVDQArm:
case X86::MOVDQUrm:
// AVX load instructions
case X86::VMOVSSrm:
case X86::VMOVSDrm:
case X86::VMOVAPSrm:
case X86::VMOVUPSrm:
case X86::VMOVAPDrm:
case X86::VMOVUPDrm:
case X86::VMOVDQArm:
case X86::VMOVDQUrm:
case X86::VMOVAPSYrm:
case X86::VMOVUPSYrm:
case X86::VMOVAPDYrm:
case X86::VMOVUPDYrm:
case X86::VMOVDQAYrm:
case X86::VMOVDQUYrm:
// AVX512 load instructions
case X86::VMOVSSZrm:
case X86::VMOVSDZrm:
case X86::VMOVAPSZ128rm:
case X86::VMOVUPSZ128rm:
case X86::VMOVAPSZ128rm_NOVLX:
case X86::VMOVUPSZ128rm_NOVLX:
case X86::VMOVAPDZ128rm:
case X86::VMOVUPDZ128rm:
case X86::VMOVDQU8Z128rm:
case X86::VMOVDQU16Z128rm:
case X86::VMOVDQA32Z128rm:
case X86::VMOVDQU32Z128rm:
case X86::VMOVDQA64Z128rm:
case X86::VMOVDQU64Z128rm:
case X86::VMOVAPSZ256rm:
case X86::VMOVUPSZ256rm:
case X86::VMOVAPSZ256rm_NOVLX:
case X86::VMOVUPSZ256rm_NOVLX:
case X86::VMOVAPDZ256rm:
case X86::VMOVUPDZ256rm:
case X86::VMOVDQU8Z256rm:
case X86::VMOVDQU16Z256rm:
case X86::VMOVDQA32Z256rm:
case X86::VMOVDQU32Z256rm:
case X86::VMOVDQA64Z256rm:
case X86::VMOVDQU64Z256rm:
case X86::VMOVAPSZrm:
case X86::VMOVUPSZrm:
case X86::VMOVAPDZrm:
case X86::VMOVUPDZrm:
case X86::VMOVDQU8Zrm:
case X86::VMOVDQU16Zrm:
case X86::VMOVDQA32Zrm:
case X86::VMOVDQU32Zrm:
case X86::VMOVDQA64Zrm:
case X86::VMOVDQU64Zrm:
case X86::KMOVBkm:
case X86::KMOVWkm:
case X86::KMOVDkm:
case X86::KMOVQkm:
break;
}
// Check if chain operands and base addresses match.
if (Load1->getOperand(0) != Load2->getOperand(0) ||
Load1->getOperand(5) != Load2->getOperand(5))
return false;
// Segment operands should match as well.
if (Load1->getOperand(4) != Load2->getOperand(4))
return false;
// Scale should be 1, Index should be Reg0.
if (Load1->getOperand(1) == Load2->getOperand(1) &&
Load1->getOperand(2) == Load2->getOperand(2)) {
if (cast<ConstantSDNode>(Load1->getOperand(1))->getZExtValue() != 1)
return false;
// Now let's examine the displacements.
if (isa<ConstantSDNode>(Load1->getOperand(3)) &&
isa<ConstantSDNode>(Load2->getOperand(3))) {
Offset1 = cast<ConstantSDNode>(Load1->getOperand(3))->getSExtValue();
Offset2 = cast<ConstantSDNode>(Load2->getOperand(3))->getSExtValue();
return true;
}
}
return false;
}
bool X86InstrInfo::shouldScheduleLoadsNear(SDNode *Load1, SDNode *Load2,
int64_t Offset1, int64_t Offset2,
unsigned NumLoads) const {
assert(Offset2 > Offset1);
if ((Offset2 - Offset1) / 8 > 64)
return false;
unsigned Opc1 = Load1->getMachineOpcode();
unsigned Opc2 = Load2->getMachineOpcode();
if (Opc1 != Opc2)
return false; // FIXME: overly conservative?
switch (Opc1) {
default: break;
case X86::LD_Fp32m:
case X86::LD_Fp64m:
case X86::LD_Fp80m:
case X86::MMX_MOVD64rm:
case X86::MMX_MOVQ64rm:
return false;
}
EVT VT = Load1->getValueType(0);
switch (VT.getSimpleVT().SimpleTy) {
default:
// XMM registers. In 64-bit mode we can be a bit more aggressive since we
// have 16 of them to play with.
if (Subtarget.is64Bit()) {
if (NumLoads >= 3)
return false;
} else if (NumLoads) {
return false;
}
break;
case MVT::i8:
case MVT::i16:
case MVT::i32:
case MVT::i64:
case MVT::f32:
case MVT::f64:
if (NumLoads)
return false;
break;
}
return true;
}
bool X86InstrInfo::shouldScheduleAdjacent(const MachineInstr &First,
const MachineInstr &Second) const {
// Check if this processor supports macro-fusion. Since this is a minor
// heuristic, we haven't specifically reserved a feature. hasAVX is a decent
// proxy for SandyBridge+.
if (!Subtarget.hasAVX())
return false;
enum {
FuseTest,
FuseCmp,
FuseInc
} FuseKind;
switch (Second.getOpcode()) {
default:
return false;
case X86::JE_1:
case X86::JNE_1:
case X86::JL_1:
case X86::JLE_1:
case X86::JG_1:
case X86::JGE_1:
FuseKind = FuseInc;
break;
case X86::JB_1:
case X86::JBE_1:
case X86::JA_1:
case X86::JAE_1:
FuseKind = FuseCmp;
break;
case X86::JS_1:
case X86::JNS_1:
case X86::JP_1:
case X86::JNP_1:
case X86::JO_1:
case X86::JNO_1:
FuseKind = FuseTest;
break;
}
switch (First.getOpcode()) {
default:
return false;
case X86::TEST8rr:
case X86::TEST16rr:
case X86::TEST32rr:
case X86::TEST64rr:
case X86::TEST8ri:
case X86::TEST16ri:
case X86::TEST32ri:
case X86::TEST32i32:
case X86::TEST64i32:
case X86::TEST64ri32:
case X86::TEST8rm:
case X86::TEST16rm:
case X86::TEST32rm:
case X86::TEST64rm:
case X86::TEST8ri_NOREX:
case X86::AND16i16:
case X86::AND16ri:
case X86::AND16ri8:
case X86::AND16rm:
case X86::AND16rr:
case X86::AND32i32:
case X86::AND32ri:
case X86::AND32ri8:
case X86::AND32rm:
case X86::AND32rr:
case X86::AND64i32:
case X86::AND64ri32:
case X86::AND64ri8:
case X86::AND64rm:
case X86::AND64rr:
case X86::AND8i8:
case X86::AND8ri:
case X86::AND8rm:
case X86::AND8rr:
return true;
case X86::CMP16i16:
case X86::CMP16ri:
case X86::CMP16ri8:
case X86::CMP16rm:
case X86::CMP16rr:
case X86::CMP32i32:
case X86::CMP32ri:
case X86::CMP32ri8:
case X86::CMP32rm:
case X86::CMP32rr:
case X86::CMP64i32:
case X86::CMP64ri32:
case X86::CMP64ri8:
case X86::CMP64rm:
case X86::CMP64rr:
case X86::CMP8i8:
case X86::CMP8ri:
case X86::CMP8rm:
case X86::CMP8rr:
case X86::ADD16i16:
case X86::ADD16ri:
case X86::ADD16ri8:
case X86::ADD16ri8_DB:
case X86::ADD16ri_DB:
case X86::ADD16rm:
case X86::ADD16rr:
case X86::ADD16rr_DB:
case X86::ADD32i32:
case X86::ADD32ri:
case X86::ADD32ri8:
case X86::ADD32ri8_DB:
case X86::ADD32ri_DB:
case X86::ADD32rm:
case X86::ADD32rr:
case X86::ADD32rr_DB:
case X86::ADD64i32:
case X86::ADD64ri32:
case X86::ADD64ri32_DB:
case X86::ADD64ri8:
case X86::ADD64ri8_DB:
case X86::ADD64rm:
case X86::ADD64rr:
case X86::ADD64rr_DB:
case X86::ADD8i8:
case X86::ADD8mi:
case X86::ADD8mr:
case X86::ADD8ri:
case X86::ADD8rm:
case X86::ADD8rr:
case X86::SUB16i16:
case X86::SUB16ri:
case X86::SUB16ri8:
case X86::SUB16rm:
case X86::SUB16rr:
case X86::SUB32i32:
case X86::SUB32ri:
case X86::SUB32ri8:
case X86::SUB32rm:
case X86::SUB32rr:
case X86::SUB64i32:
case X86::SUB64ri32:
case X86::SUB64ri8:
case X86::SUB64rm:
case X86::SUB64rr:
case X86::SUB8i8:
case X86::SUB8ri:
case X86::SUB8rm:
case X86::SUB8rr:
return FuseKind == FuseCmp || FuseKind == FuseInc;
case X86::INC16r:
case X86::INC32r:
case X86::INC64r:
case X86::INC8r:
case X86::DEC16r:
case X86::DEC32r:
case X86::DEC64r:
case X86::DEC8r:
return FuseKind == FuseInc;
}
}
bool X86InstrInfo::
reverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const {
assert(Cond.size() == 1 && "Invalid X86 branch condition!");
X86::CondCode CC = static_cast<X86::CondCode>(Cond[0].getImm());
Cond[0].setImm(GetOppositeBranchCondition(CC));
return false;
}
bool X86InstrInfo::
isSafeToMoveRegClassDefs(const TargetRegisterClass *RC) const {
// FIXME: Return false for x87 stack register classes for now. We can't
// allow any loads of these registers before FpGet_ST0_80.
return !(RC == &X86::CCRRegClass || RC == &X86::RFP32RegClass ||
RC == &X86::RFP64RegClass || RC == &X86::RFP80RegClass);
}
/// Return a virtual register initialized with the
/// the global base register value. Output instructions required to
/// initialize the register in the function entry block, if necessary.
///
/// TODO: Eliminate this and move the code to X86MachineFunctionInfo.
///
unsigned X86InstrInfo::getGlobalBaseReg(MachineFunction *MF) const {
assert(!Subtarget.is64Bit() &&
"X86-64 PIC uses RIP relative addressing");
X86MachineFunctionInfo *X86FI = MF->getInfo<X86MachineFunctionInfo>();
unsigned GlobalBaseReg = X86FI->getGlobalBaseReg();
if (GlobalBaseReg != 0)
return GlobalBaseReg;
// Create the register. The code to initialize it is inserted
// later, by the CGBR pass (below).
MachineRegisterInfo &RegInfo = MF->getRegInfo();
GlobalBaseReg = RegInfo.createVirtualRegister(&X86::GR32_NOSPRegClass);
X86FI->setGlobalBaseReg(GlobalBaseReg);
return GlobalBaseReg;
}
// These are the replaceable SSE instructions. Some of these have Int variants
// that we don't include here. We don't want to replace instructions selected
// by intrinsics.
static const uint16_t ReplaceableInstrs[][3] = {
//PackedSingle PackedDouble PackedInt
{ X86::MOVAPSmr, X86::MOVAPDmr, X86::MOVDQAmr },
{ X86::MOVAPSrm, X86::MOVAPDrm, X86::MOVDQArm },
{ X86::MOVAPSrr, X86::MOVAPDrr, X86::MOVDQArr },
{ X86::MOVUPSmr, X86::MOVUPDmr, X86::MOVDQUmr },
{ X86::MOVUPSrm, X86::MOVUPDrm, X86::MOVDQUrm },
{ X86::MOVLPSmr, X86::MOVLPDmr, X86::MOVPQI2QImr },
{ X86::MOVSSmr, X86::MOVSSmr, X86::MOVPDI2DImr },
{ X86::MOVSDrm, X86::MOVSDrm, X86::MOVQI2PQIrm },
{ X86::MOVSSrm, X86::MOVSSrm, X86::MOVDI2PDIrm },
{ X86::MOVNTPSmr, X86::MOVNTPDmr, X86::MOVNTDQmr },
{ X86::ANDNPSrm, X86::ANDNPDrm, X86::PANDNrm },
{ X86::ANDNPSrr, X86::ANDNPDrr, X86::PANDNrr },
{ X86::ANDPSrm, X86::ANDPDrm, X86::PANDrm },
{ X86::ANDPSrr, X86::ANDPDrr, X86::PANDrr },
{ X86::ORPSrm, X86::ORPDrm, X86::PORrm },
{ X86::ORPSrr, X86::ORPDrr, X86::PORrr },
{ X86::XORPSrm, X86::XORPDrm, X86::PXORrm },
{ X86::XORPSrr, X86::XORPDrr, X86::PXORrr },
// AVX 128-bit support
{ X86::VMOVAPSmr, X86::VMOVAPDmr, X86::VMOVDQAmr },
{ X86::VMOVAPSrm, X86::VMOVAPDrm, X86::VMOVDQArm },
{ X86::VMOVAPSrr, X86::VMOVAPDrr, X86::VMOVDQArr },
{ X86::VMOVUPSmr, X86::VMOVUPDmr, X86::VMOVDQUmr },
{ X86::VMOVUPSrm, X86::VMOVUPDrm, X86::VMOVDQUrm },
{ X86::VMOVLPSmr, X86::VMOVLPDmr, X86::VMOVPQI2QImr },
{ X86::VMOVSSmr, X86::VMOVSSmr, X86::VMOVPDI2DImr },
{ X86::VMOVSDrm, X86::VMOVSDrm, X86::VMOVQI2PQIrm },
{ X86::VMOVSSrm, X86::VMOVSSrm, X86::VMOVDI2PDIrm },
{ X86::VMOVNTPSmr, X86::VMOVNTPDmr, X86::VMOVNTDQmr },
{ X86::VANDNPSrm, X86::VANDNPDrm, X86::VPANDNrm },
{ X86::VANDNPSrr, X86::VANDNPDrr, X86::VPANDNrr },
{ X86::VANDPSrm, X86::VANDPDrm, X86::VPANDrm },
{ X86::VANDPSrr, X86::VANDPDrr, X86::VPANDrr },
{ X86::VORPSrm, X86::VORPDrm, X86::VPORrm },
{ X86::VORPSrr, X86::VORPDrr, X86::VPORrr },
{ X86::VXORPSrm, X86::VXORPDrm, X86::VPXORrm },
{ X86::VXORPSrr, X86::VXORPDrr, X86::VPXORrr },
// AVX 256-bit support
{ X86::VMOVAPSYmr, X86::VMOVAPDYmr, X86::VMOVDQAYmr },
{ X86::VMOVAPSYrm, X86::VMOVAPDYrm, X86::VMOVDQAYrm },
{ X86::VMOVAPSYrr, X86::VMOVAPDYrr, X86::VMOVDQAYrr },
{ X86::VMOVUPSYmr, X86::VMOVUPDYmr, X86::VMOVDQUYmr },
{ X86::VMOVUPSYrm, X86::VMOVUPDYrm, X86::VMOVDQUYrm },
{ X86::VMOVNTPSYmr, X86::VMOVNTPDYmr, X86::VMOVNTDQYmr },
// AVX512 support
{ X86::VMOVLPSZ128mr, X86::VMOVLPDZ128mr, X86::VMOVPQI2QIZmr },
{ X86::VMOVNTPSZ128mr, X86::VMOVNTPDZ128mr, X86::VMOVNTDQZ128mr },
{ X86::VMOVNTPSZ128mr, X86::VMOVNTPDZ128mr, X86::VMOVNTDQZ128mr },
{ X86::VMOVNTPSZmr, X86::VMOVNTPDZmr, X86::VMOVNTDQZmr },
{ X86::VMOVSDZmr, X86::VMOVSDZmr, X86::VMOVPQI2QIZmr },
{ X86::VMOVSSZmr, X86::VMOVSSZmr, X86::VMOVPDI2DIZmr },
{ X86::VMOVSDZrm, X86::VMOVSDZrm, X86::VMOVQI2PQIZrm },
{ X86::VMOVSSZrm, X86::VMOVSSZrm, X86::VMOVDI2PDIZrm },
{ X86::VBROADCASTSSZ128r, X86::VBROADCASTSSZ128r, X86::VPBROADCASTDZ128r },
{ X86::VBROADCASTSSZ128m, X86::VBROADCASTSSZ128m, X86::VPBROADCASTDZ128m },
{ X86::VBROADCASTSSZ256r, X86::VBROADCASTSSZ256r, X86::VPBROADCASTDZ256r },
{ X86::VBROADCASTSSZ256m, X86::VBROADCASTSSZ256m, X86::VPBROADCASTDZ256m },
{ X86::VBROADCASTSSZr, X86::VBROADCASTSSZr, X86::VPBROADCASTDZr },
{ X86::VBROADCASTSSZm, X86::VBROADCASTSSZm, X86::VPBROADCASTDZm },
{ X86::VBROADCASTSDZ256r, X86::VBROADCASTSDZ256r, X86::VPBROADCASTQZ256r },
{ X86::VBROADCASTSDZ256m, X86::VBROADCASTSDZ256m, X86::VPBROADCASTQZ256m },
{ X86::VBROADCASTSDZr, X86::VBROADCASTSDZr, X86::VPBROADCASTQZr },
{ X86::VBROADCASTSDZm, X86::VBROADCASTSDZm, X86::VPBROADCASTQZm },
};
static const uint16_t ReplaceableInstrsAVX2[][3] = {
//PackedSingle PackedDouble PackedInt
{ X86::VANDNPSYrm, X86::VANDNPDYrm, X86::VPANDNYrm },
{ X86::VANDNPSYrr, X86::VANDNPDYrr, X86::VPANDNYrr },
{ X86::VANDPSYrm, X86::VANDPDYrm, X86::VPANDYrm },
{ X86::VANDPSYrr, X86::VANDPDYrr, X86::VPANDYrr },
{ X86::VORPSYrm, X86::VORPDYrm, X86::VPORYrm },
{ X86::VORPSYrr, X86::VORPDYrr, X86::VPORYrr },
{ X86::VXORPSYrm, X86::VXORPDYrm, X86::VPXORYrm },
{ X86::VXORPSYrr, X86::VXORPDYrr, X86::VPXORYrr },
{ X86::VEXTRACTF128mr, X86::VEXTRACTF128mr, X86::VEXTRACTI128mr },
{ X86::VEXTRACTF128rr, X86::VEXTRACTF128rr, X86::VEXTRACTI128rr },
{ X86::VINSERTF128rm, X86::VINSERTF128rm, X86::VINSERTI128rm },
{ X86::VINSERTF128rr, X86::VINSERTF128rr, X86::VINSERTI128rr },
{ X86::VPERM2F128rm, X86::VPERM2F128rm, X86::VPERM2I128rm },
{ X86::VPERM2F128rr, X86::VPERM2F128rr, X86::VPERM2I128rr },
{ X86::VBROADCASTSSrm, X86::VBROADCASTSSrm, X86::VPBROADCASTDrm},
{ X86::VBROADCASTSSrr, X86::VBROADCASTSSrr, X86::VPBROADCASTDrr},
{ X86::VBROADCASTSSYrr, X86::VBROADCASTSSYrr, X86::VPBROADCASTDYrr},
{ X86::VBROADCASTSSYrm, X86::VBROADCASTSSYrm, X86::VPBROADCASTDYrm},
{ X86::VBROADCASTSDYrr, X86::VBROADCASTSDYrr, X86::VPBROADCASTQYrr},
{ X86::VBROADCASTSDYrm, X86::VBROADCASTSDYrm, X86::VPBROADCASTQYrm},
{ X86::VBROADCASTF128, X86::VBROADCASTF128, X86::VBROADCASTI128 },
};
static const uint16_t ReplaceableInstrsAVX512[][4] = {
// Two integer columns for 64-bit and 32-bit elements.
//PackedSingle PackedDouble PackedInt PackedInt
{ X86::VMOVAPSZ128mr, X86::VMOVAPDZ128mr, X86::VMOVDQA64Z128mr, X86::VMOVDQA32Z128mr },
{ X86::VMOVAPSZ128rm, X86::VMOVAPDZ128rm, X86::VMOVDQA64Z128rm, X86::VMOVDQA32Z128rm },
{ X86::VMOVAPSZ128rr, X86::VMOVAPDZ128rr, X86::VMOVDQA64Z128rr, X86::VMOVDQA32Z128rr },
{ X86::VMOVUPSZ128mr, X86::VMOVUPDZ128mr, X86::VMOVDQU64Z128mr, X86::VMOVDQU32Z128mr },
{ X86::VMOVUPSZ128rm, X86::VMOVUPDZ128rm, X86::VMOVDQU64Z128rm, X86::VMOVDQU32Z128rm },
{ X86::VMOVAPSZ256mr, X86::VMOVAPDZ256mr, X86::VMOVDQA64Z256mr, X86::VMOVDQA32Z256mr },
{ X86::VMOVAPSZ256rm, X86::VMOVAPDZ256rm, X86::VMOVDQA64Z256rm, X86::VMOVDQA32Z256rm },
{ X86::VMOVAPSZ256rr, X86::VMOVAPDZ256rr, X86::VMOVDQA64Z256rr, X86::VMOVDQA32Z256rr },
{ X86::VMOVUPSZ256mr, X86::VMOVUPDZ256mr, X86::VMOVDQU64Z256mr, X86::VMOVDQU32Z256mr },
{ X86::VMOVUPSZ256rm, X86::VMOVUPDZ256rm, X86::VMOVDQU64Z256rm, X86::VMOVDQU32Z256rm },
{ X86::VMOVAPSZmr, X86::VMOVAPDZmr, X86::VMOVDQA64Zmr, X86::VMOVDQA32Zmr },
{ X86::VMOVAPSZrm, X86::VMOVAPDZrm, X86::VMOVDQA64Zrm, X86::VMOVDQA32Zrm },
{ X86::VMOVAPSZrr, X86::VMOVAPDZrr, X86::VMOVDQA64Zrr, X86::VMOVDQA32Zrr },
{ X86::VMOVUPSZmr, X86::VMOVUPDZmr, X86::VMOVDQU64Zmr, X86::VMOVDQU32Zmr },
{ X86::VMOVUPSZrm, X86::VMOVUPDZrm, X86::VMOVDQU64Zrm, X86::VMOVDQU32Zrm },
};
static const uint16_t ReplaceableInstrsAVX512DQ[][4] = {
// Two integer columns for 64-bit and 32-bit elements.
//PackedSingle PackedDouble PackedInt PackedInt
{ X86::VANDNPSZ128rm, X86::VANDNPDZ128rm, X86::VPANDNQZ128rm, X86::VPANDNDZ128rm },
{ X86::VANDNPSZ128rr, X86::VANDNPDZ128rr, X86::VPANDNQZ128rr, X86::VPANDNDZ128rr },
{ X86::VANDPSZ128rm, X86::VANDPDZ128rm, X86::VPANDQZ128rm, X86::VPANDDZ128rm },
{ X86::VANDPSZ128rr, X86::VANDPDZ128rr, X86::VPANDQZ128rr, X86::VPANDDZ128rr },
{ X86::VORPSZ128rm, X86::VORPDZ128rm, X86::VPORQZ128rm, X86::VPORDZ128rm },
{ X86::VORPSZ128rr, X86::VORPDZ128rr, X86::VPORQZ128rr, X86::VPORDZ128rr },
{ X86::VXORPSZ128rm, X86::VXORPDZ128rm, X86::VPXORQZ128rm, X86::VPXORDZ128rm },
{ X86::VXORPSZ128rr, X86::VXORPDZ128rr, X86::VPXORQZ128rr, X86::VPXORDZ128rr },
{ X86::VANDNPSZ256rm, X86::VANDNPDZ256rm, X86::VPANDNQZ256rm, X86::VPANDNDZ256rm },
{ X86::VANDNPSZ256rr, X86::VANDNPDZ256rr, X86::VPANDNQZ256rr, X86::VPANDNDZ256rr },
{ X86::VANDPSZ256rm, X86::VANDPDZ256rm, X86::VPANDQZ256rm, X86::VPANDDZ256rm },
{ X86::VANDPSZ256rr, X86::VANDPDZ256rr, X86::VPANDQZ256rr, X86::VPANDDZ256rr },
{ X86::VORPSZ256rm, X86::VORPDZ256rm, X86::VPORQZ256rm, X86::VPORDZ256rm },
{ X86::VORPSZ256rr, X86::VORPDZ256rr, X86::VPORQZ256rr, X86::VPORDZ256rr },
{ X86::VXORPSZ256rm, X86::VXORPDZ256rm, X86::VPXORQZ256rm, X86::VPXORDZ256rm },
{ X86::VXORPSZ256rr, X86::VXORPDZ256rr, X86::VPXORQZ256rr, X86::VPXORDZ256rr },
{ X86::VANDNPSZrm, X86::VANDNPDZrm, X86::VPANDNQZrm, X86::VPANDNDZrm },
{ X86::VANDNPSZrr, X86::VANDNPDZrr, X86::VPANDNQZrr, X86::VPANDNDZrr },
{ X86::VANDPSZrm, X86::VANDPDZrm, X86::VPANDQZrm, X86::VPANDDZrm },
{ X86::VANDPSZrr, X86::VANDPDZrr, X86::VPANDQZrr, X86::VPANDDZrr },
{ X86::VORPSZrm, X86::VORPDZrm, X86::VPORQZrm, X86::VPORDZrm },
{ X86::VORPSZrr, X86::VORPDZrr, X86::VPORQZrr, X86::VPORDZrr },
{ X86::VXORPSZrm, X86::VXORPDZrm, X86::VPXORQZrm, X86::VPXORDZrm },
{ X86::VXORPSZrr, X86::VXORPDZrr, X86::VPXORQZrr, X86::VPXORDZrr },
};
static const uint16_t ReplaceableInstrsAVX512DQMasked[][4] = {
// Two integer columns for 64-bit and 32-bit elements.
//PackedSingle PackedDouble
//PackedInt PackedInt
{ X86::VANDNPSZ128rmk, X86::VANDNPDZ128rmk,
X86::VPANDNQZ128rmk, X86::VPANDNDZ128rmk },
{ X86::VANDNPSZ128rmkz, X86::VANDNPDZ128rmkz,
X86::VPANDNQZ128rmkz, X86::VPANDNDZ128rmkz },
{ X86::VANDNPSZ128rrk, X86::VANDNPDZ128rrk,
X86::VPANDNQZ128rrk, X86::VPANDNDZ128rrk },
{ X86::VANDNPSZ128rrkz, X86::VANDNPDZ128rrkz,
X86::VPANDNQZ128rrkz, X86::VPANDNDZ128rrkz },
{ X86::VANDPSZ128rmk, X86::VANDPDZ128rmk,
X86::VPANDQZ128rmk, X86::VPANDDZ128rmk },
{ X86::VANDPSZ128rmkz, X86::VANDPDZ128rmkz,
X86::VPANDQZ128rmkz, X86::VPANDDZ128rmkz },
{ X86::VANDPSZ128rrk, X86::VANDPDZ128rrk,
X86::VPANDQZ128rrk, X86::VPANDDZ128rrk },
{ X86::VANDPSZ128rrkz, X86::VANDPDZ128rrkz,
X86::VPANDQZ128rrkz, X86::VPANDDZ128rrkz },
{ X86::VORPSZ128rmk, X86::VORPDZ128rmk,
X86::VPORQZ128rmk, X86::VPORDZ128rmk },
{ X86::VORPSZ128rmkz, X86::VORPDZ128rmkz,
X86::VPORQZ128rmkz, X86::VPORDZ128rmkz },
{ X86::VORPSZ128rrk, X86::VORPDZ128rrk,
X86::VPORQZ128rrk, X86::VPORDZ128rrk },
{ X86::VORPSZ128rrkz, X86::VORPDZ128rrkz,
X86::VPORQZ128rrkz, X86::VPORDZ128rrkz },
{ X86::VXORPSZ128rmk, X86::VXORPDZ128rmk,
X86::VPXORQZ128rmk, X86::VPXORDZ128rmk },
{ X86::VXORPSZ128rmkz, X86::VXORPDZ128rmkz,
X86::VPXORQZ128rmkz, X86::VPXORDZ128rmkz },
{ X86::VXORPSZ128rrk, X86::VXORPDZ128rrk,
X86::VPXORQZ128rrk, X86::VPXORDZ128rrk },
{ X86::VXORPSZ128rrkz, X86::VXORPDZ128rrkz,
X86::VPXORQZ128rrkz, X86::VPXORDZ128rrkz },
{ X86::VANDNPSZ256rmk, X86::VANDNPDZ256rmk,
X86::VPANDNQZ256rmk, X86::VPANDNDZ256rmk },
{ X86::VANDNPSZ256rmkz, X86::VANDNPDZ256rmkz,
X86::VPANDNQZ256rmkz, X86::VPANDNDZ256rmkz },
{ X86::VANDNPSZ256rrk, X86::VANDNPDZ256rrk,
X86::VPANDNQZ256rrk, X86::VPANDNDZ256rrk },
{ X86::VANDNPSZ256rrkz, X86::VANDNPDZ256rrkz,
X86::VPANDNQZ256rrkz, X86::VPANDNDZ256rrkz },
{ X86::VANDPSZ256rmk, X86::VANDPDZ256rmk,
X86::VPANDQZ256rmk, X86::VPANDDZ256rmk },
{ X86::VANDPSZ256rmkz, X86::VANDPDZ256rmkz,
X86::VPANDQZ256rmkz, X86::VPANDDZ256rmkz },
{ X86::VANDPSZ256rrk, X86::VANDPDZ256rrk,
X86::VPANDQZ256rrk, X86::VPANDDZ256rrk },
{ X86::VANDPSZ256rrkz, X86::VANDPDZ256rrkz,
X86::VPANDQZ256rrkz, X86::VPANDDZ256rrkz },
{ X86::VORPSZ256rmk, X86::VORPDZ256rmk,
X86::VPORQZ256rmk, X86::VPORDZ256rmk },
{ X86::VORPSZ256rmkz, X86::VORPDZ256rmkz,
X86::VPORQZ256rmkz, X86::VPORDZ256rmkz },
{ X86::VORPSZ256rrk, X86::VORPDZ256rrk,
X86::VPORQZ256rrk, X86::VPORDZ256rrk },
{ X86::VORPSZ256rrkz, X86::VORPDZ256rrkz,
X86::VPORQZ256rrkz, X86::VPORDZ256rrkz },
{ X86::VXORPSZ256rmk, X86::VXORPDZ256rmk,
X86::VPXORQZ256rmk, X86::VPXORDZ256rmk },
{ X86::VXORPSZ256rmkz, X86::VXORPDZ256rmkz,
X86::VPXORQZ256rmkz, X86::VPXORDZ256rmkz },
{ X86::VXORPSZ256rrk, X86::VXORPDZ256rrk,
X86::VPXORQZ256rrk, X86::VPXORDZ256rrk },
{ X86::VXORPSZ256rrkz, X86::VXORPDZ256rrkz,
X86::VPXORQZ256rrkz, X86::VPXORDZ256rrkz },
{ X86::VANDNPSZrmk, X86::VANDNPDZrmk,
X86::VPANDNQZrmk, X86::VPANDNDZrmk },
{ X86::VANDNPSZrmkz, X86::VANDNPDZrmkz,
X86::VPANDNQZrmkz, X86::VPANDNDZrmkz },
{ X86::VANDNPSZrrk, X86::VANDNPDZrrk,
X86::VPANDNQZrrk, X86::VPANDNDZrrk },
{ X86::VANDNPSZrrkz, X86::VANDNPDZrrkz,
X86::VPANDNQZrrkz, X86::VPANDNDZrrkz },
{ X86::VANDPSZrmk, X86::VANDPDZrmk,
X86::VPANDQZrmk, X86::VPANDDZrmk },
{ X86::VANDPSZrmkz, X86::VANDPDZrmkz,
X86::VPANDQZrmkz, X86::VPANDDZrmkz },
{ X86::VANDPSZrrk, X86::VANDPDZrrk,
X86::VPANDQZrrk, X86::VPANDDZrrk },
{ X86::VANDPSZrrkz, X86::VANDPDZrrkz,
X86::VPANDQZrrkz, X86::VPANDDZrrkz },
{ X86::VORPSZrmk, X86::VORPDZrmk,
X86::VPORQZrmk, X86::VPORDZrmk },
{ X86::VORPSZrmkz, X86::VORPDZrmkz,
X86::VPORQZrmkz, X86::VPORDZrmkz },
{ X86::VORPSZrrk, X86::VORPDZrrk,
X86::VPORQZrrk, X86::VPORDZrrk },
{ X86::VORPSZrrkz, X86::VORPDZrrkz,
X86::VPORQZrrkz, X86::VPORDZrrkz },
{ X86::VXORPSZrmk, X86::VXORPDZrmk,
X86::VPXORQZrmk, X86::VPXORDZrmk },
{ X86::VXORPSZrmkz, X86::VXORPDZrmkz,
X86::VPXORQZrmkz, X86::VPXORDZrmkz },
{ X86::VXORPSZrrk, X86::VXORPDZrrk,
X86::VPXORQZrrk, X86::VPXORDZrrk },
{ X86::VXORPSZrrkz, X86::VXORPDZrrkz,
X86::VPXORQZrrkz, X86::VPXORDZrrkz },
// Broadcast loads can be handled the same as masked operations to avoid
// changing element size.
{ X86::VANDNPSZ128rmb, X86::VANDNPDZ128rmb,
X86::VPANDNQZ128rmb, X86::VPANDNDZ128rmb },
{ X86::VANDPSZ128rmb, X86::VANDPDZ128rmb,
X86::VPANDQZ128rmb, X86::VPANDDZ128rmb },
{ X86::VORPSZ128rmb, X86::VORPDZ128rmb,
X86::VPORQZ128rmb, X86::VPORDZ128rmb },
{ X86::VXORPSZ128rmb, X86::VXORPDZ128rmb,
X86::VPXORQZ128rmb, X86::VPXORDZ128rmb },
{ X86::VANDNPSZ256rmb, X86::VANDNPDZ256rmb,
X86::VPANDNQZ256rmb, X86::VPANDNDZ256rmb },
{ X86::VANDPSZ256rmb, X86::VANDPDZ256rmb,
X86::VPANDQZ256rmb, X86::VPANDDZ256rmb },
{ X86::VORPSZ256rmb, X86::VORPDZ256rmb,
X86::VPORQZ256rmb, X86::VPORDZ256rmb },
{ X86::VXORPSZ256rmb, X86::VXORPDZ256rmb,
X86::VPXORQZ256rmb, X86::VPXORDZ256rmb },
{ X86::VANDNPSZrmb, X86::VANDNPDZrmb,
X86::VPANDNQZrmb, X86::VPANDNDZrmb },
{ X86::VANDPSZrmb, X86::VANDPDZrmb,
X86::VPANDQZrmb, X86::VPANDDZrmb },
{ X86::VANDPSZrmb, X86::VANDPDZrmb,
X86::VPANDQZrmb, X86::VPANDDZrmb },
{ X86::VORPSZrmb, X86::VORPDZrmb,
X86::VPORQZrmb, X86::VPORDZrmb },
{ X86::VXORPSZrmb, X86::VXORPDZrmb,
X86::VPXORQZrmb, X86::VPXORDZrmb },
{ X86::VANDNPSZ128rmbk, X86::VANDNPDZ128rmbk,
X86::VPANDNQZ128rmbk, X86::VPANDNDZ128rmbk },
{ X86::VANDPSZ128rmbk, X86::VANDPDZ128rmbk,
X86::VPANDQZ128rmbk, X86::VPANDDZ128rmbk },
{ X86::VORPSZ128rmbk, X86::VORPDZ128rmbk,
X86::VPORQZ128rmbk, X86::VPORDZ128rmbk },
{ X86::VXORPSZ128rmbk, X86::VXORPDZ128rmbk,
X86::VPXORQZ128rmbk, X86::VPXORDZ128rmbk },
{ X86::VANDNPSZ256rmbk, X86::VANDNPDZ256rmbk,
X86::VPANDNQZ256rmbk, X86::VPANDNDZ256rmbk },
{ X86::VANDPSZ256rmbk, X86::VANDPDZ256rmbk,
X86::VPANDQZ256rmbk, X86::VPANDDZ256rmbk },
{ X86::VORPSZ256rmbk, X86::VORPDZ256rmbk,
X86::VPORQZ256rmbk, X86::VPORDZ256rmbk },
{ X86::VXORPSZ256rmbk, X86::VXORPDZ256rmbk,
X86::VPXORQZ256rmbk, X86::VPXORDZ256rmbk },
{ X86::VANDNPSZrmbk, X86::VANDNPDZrmbk,
X86::VPANDNQZrmbk, X86::VPANDNDZrmbk },
{ X86::VANDPSZrmbk, X86::VANDPDZrmbk,
X86::VPANDQZrmbk, X86::VPANDDZrmbk },
{ X86::VANDPSZrmbk, X86::VANDPDZrmbk,
X86::VPANDQZrmbk, X86::VPANDDZrmbk },
{ X86::VORPSZrmbk, X86::VORPDZrmbk,
X86::VPORQZrmbk, X86::VPORDZrmbk },
{ X86::VXORPSZrmbk, X86::VXORPDZrmbk,
X86::VPXORQZrmbk, X86::VPXORDZrmbk },
{ X86::VANDNPSZ128rmbkz,X86::VANDNPDZ128rmbkz,
X86::VPANDNQZ128rmbkz,X86::VPANDNDZ128rmbkz},
{ X86::VANDPSZ128rmbkz, X86::VANDPDZ128rmbkz,
X86::VPANDQZ128rmbkz, X86::VPANDDZ128rmbkz },
{ X86::VORPSZ128rmbkz, X86::VORPDZ128rmbkz,
X86::VPORQZ128rmbkz, X86::VPORDZ128rmbkz },
{ X86::VXORPSZ128rmbkz, X86::VXORPDZ128rmbkz,
X86::VPXORQZ128rmbkz, X86::VPXORDZ128rmbkz },
{ X86::VANDNPSZ256rmbkz,X86::VANDNPDZ256rmbkz,
X86::VPANDNQZ256rmbkz,X86::VPANDNDZ256rmbkz},
{ X86::VANDPSZ256rmbkz, X86::VANDPDZ256rmbkz,
X86::VPANDQZ256rmbkz, X86::VPANDDZ256rmbkz },
{ X86::VORPSZ256rmbkz, X86::VORPDZ256rmbkz,
X86::VPORQZ256rmbkz, X86::VPORDZ256rmbkz },
{ X86::VXORPSZ256rmbkz, X86::VXORPDZ256rmbkz,
X86::VPXORQZ256rmbkz, X86::VPXORDZ256rmbkz },
{ X86::VANDNPSZrmbkz, X86::VANDNPDZrmbkz,
X86::VPANDNQZrmbkz, X86::VPANDNDZrmbkz },
{ X86::VANDPSZrmbkz, X86::VANDPDZrmbkz,
X86::VPANDQZrmbkz, X86::VPANDDZrmbkz },
{ X86::VANDPSZrmbkz, X86::VANDPDZrmbkz,
X86::VPANDQZrmbkz, X86::VPANDDZrmbkz },
{ X86::VORPSZrmbkz, X86::VORPDZrmbkz,
X86::VPORQZrmbkz, X86::VPORDZrmbkz },
{ X86::VXORPSZrmbkz, X86::VXORPDZrmbkz,
X86::VPXORQZrmbkz, X86::VPXORDZrmbkz },
};
// FIXME: Some shuffle and unpack instructions have equivalents in different
// domains, but they require a bit more work than just switching opcodes.
static const uint16_t *lookup(unsigned opcode, unsigned domain,
ArrayRef<uint16_t[3]> Table) {
for (const uint16_t (&Row)[3] : Table)
if (Row[domain-1] == opcode)
return Row;
return nullptr;
}
static const uint16_t *lookupAVX512(unsigned opcode, unsigned domain,
ArrayRef<uint16_t[4]> Table) {
// If this is the integer domain make sure to check both integer columns.
for (const uint16_t (&Row)[4] : Table)
if (Row[domain-1] == opcode || (domain == 3 && Row[3] == opcode))
return Row;
return nullptr;
}
std::pair<uint16_t, uint16_t>
X86InstrInfo::getExecutionDomain(const MachineInstr &MI) const {
uint16_t domain = (MI.getDesc().TSFlags >> X86II::SSEDomainShift) & 3;
unsigned opcode = MI.getOpcode();
uint16_t validDomains = 0;
if (domain) {
if (lookup(MI.getOpcode(), domain, ReplaceableInstrs)) {
validDomains = 0xe;
} else if (lookup(opcode, domain, ReplaceableInstrsAVX2)) {
validDomains = Subtarget.hasAVX2() ? 0xe : 0x6;
} else if (lookupAVX512(opcode, domain, ReplaceableInstrsAVX512)) {
validDomains = 0xe;
} else if (lookupAVX512(opcode, domain, ReplaceableInstrsAVX512DQ)) {
validDomains = Subtarget.hasDQI() ? 0xe : 0x8;
} else if (const uint16_t *table = lookupAVX512(opcode, domain,
ReplaceableInstrsAVX512DQMasked)) {
if (domain == 1 || (domain == 3 && table[3] == opcode))
validDomains = Subtarget.hasDQI() ? 0xa : 0x8;
else
validDomains = Subtarget.hasDQI() ? 0xc : 0x8;
}
}
return std::make_pair(domain, validDomains);
}
void X86InstrInfo::setExecutionDomain(MachineInstr &MI, unsigned Domain) const {
assert(Domain>0 && Domain<4 && "Invalid execution domain");
uint16_t dom = (MI.getDesc().TSFlags >> X86II::SSEDomainShift) & 3;
assert(dom && "Not an SSE instruction");
const uint16_t *table = lookup(MI.getOpcode(), dom, ReplaceableInstrs);
if (!table) { // try the other table
assert((Subtarget.hasAVX2() || Domain < 3) &&
"256-bit vector operations only available in AVX2");
table = lookup(MI.getOpcode(), dom, ReplaceableInstrsAVX2);
}
if (!table) { // try the AVX512 table
assert(Subtarget.hasAVX512() && "Requires AVX-512");
table = lookupAVX512(MI.getOpcode(), dom, ReplaceableInstrsAVX512);
// Don't change integer Q instructions to D instructions.
if (table && Domain == 3 && table[3] == MI.getOpcode())
Domain = 4;
}
if (!table) { // try the AVX512DQ table
assert((Subtarget.hasDQI() || Domain >= 3) && "Requires AVX-512DQ");
table = lookupAVX512(MI.getOpcode(), dom, ReplaceableInstrsAVX512DQ);
// Don't change integer Q instructions to D instructions and
// use D intructions if we started with a PS instruction.
if (table && Domain == 3 && (dom == 1 || table[3] == MI.getOpcode()))
Domain = 4;
}
if (!table) { // try the AVX512DQMasked table
assert((Subtarget.hasDQI() || Domain >= 3) && "Requires AVX-512DQ");
table = lookupAVX512(MI.getOpcode(), dom, ReplaceableInstrsAVX512DQMasked);
if (table && Domain == 3 && (dom == 1 || table[3] == MI.getOpcode()))
Domain = 4;
}
assert(table && "Cannot change domain");
MI.setDesc(get(table[Domain - 1]));
}
/// Return the noop instruction to use for a noop.
void X86InstrInfo::getNoopForMachoTarget(MCInst &NopInst) const {
NopInst.setOpcode(X86::NOOP);
}
bool X86InstrInfo::isHighLatencyDef(int opc) const {
switch (opc) {
default: return false;
case X86::DIVPDrm:
case X86::DIVPDrr:
case X86::DIVPSrm:
case X86::DIVPSrr:
case X86::DIVSDrm:
case X86::DIVSDrm_Int:
case X86::DIVSDrr:
case X86::DIVSDrr_Int:
case X86::DIVSSrm:
case X86::DIVSSrm_Int:
case X86::DIVSSrr:
case X86::DIVSSrr_Int:
case X86::SQRTPDm:
case X86::SQRTPDr:
case X86::SQRTPSm:
case X86::SQRTPSr:
case X86::SQRTSDm:
case X86::SQRTSDm_Int:
case X86::SQRTSDr:
case X86::SQRTSDr_Int:
case X86::SQRTSSm:
case X86::SQRTSSm_Int:
case X86::SQRTSSr:
case X86::SQRTSSr_Int:
// AVX instructions with high latency
case X86::VDIVPDrm:
case X86::VDIVPDrr:
case X86::VDIVPDYrm:
case X86::VDIVPDYrr:
case X86::VDIVPSrm:
case X86::VDIVPSrr:
case X86::VDIVPSYrm:
case X86::VDIVPSYrr:
case X86::VDIVSDrm:
case X86::VDIVSDrm_Int:
case X86::VDIVSDrr:
case X86::VDIVSDrr_Int:
case X86::VDIVSSrm:
case X86::VDIVSSrm_Int:
case X86::VDIVSSrr:
case X86::VDIVSSrr_Int:
case X86::VSQRTPDm:
case X86::VSQRTPDr:
case X86::VSQRTPDYm:
case X86::VSQRTPDYr:
case X86::VSQRTPSm:
case X86::VSQRTPSr:
case X86::VSQRTPSYm:
case X86::VSQRTPSYr:
case X86::VSQRTSDm:
case X86::VSQRTSDm_Int:
case X86::VSQRTSDr:
case X86::VSQRTSDr_Int:
case X86::VSQRTSSm:
case X86::VSQRTSSm_Int:
case X86::VSQRTSSr:
case X86::VSQRTSSr_Int:
// AVX512 instructions with high latency
case X86::VDIVPDZ128rm:
case X86::VDIVPDZ128rmb:
case X86::VDIVPDZ128rmbk:
case X86::VDIVPDZ128rmbkz:
case X86::VDIVPDZ128rmk:
case X86::VDIVPDZ128rmkz:
case X86::VDIVPDZ128rr:
case X86::VDIVPDZ128rrk:
case X86::VDIVPDZ128rrkz:
case X86::VDIVPDZ256rm:
case X86::VDIVPDZ256rmb:
case X86::VDIVPDZ256rmbk:
case X86::VDIVPDZ256rmbkz:
case X86::VDIVPDZ256rmk:
case X86::VDIVPDZ256rmkz:
case X86::VDIVPDZ256rr:
case X86::VDIVPDZ256rrk:
case X86::VDIVPDZ256rrkz:
case X86::VDIVPDZrb:
case X86::VDIVPDZrbk:
case X86::VDIVPDZrbkz:
case X86::VDIVPDZrm:
case X86::VDIVPDZrmb:
case X86::VDIVPDZrmbk:
case X86::VDIVPDZrmbkz:
case X86::VDIVPDZrmk:
case X86::VDIVPDZrmkz:
case X86::VDIVPDZrr:
case X86::VDIVPDZrrk:
case X86::VDIVPDZrrkz:
case X86::VDIVPSZ128rm:
case X86::VDIVPSZ128rmb:
case X86::VDIVPSZ128rmbk:
case X86::VDIVPSZ128rmbkz:
case X86::VDIVPSZ128rmk:
case X86::VDIVPSZ128rmkz:
case X86::VDIVPSZ128rr:
case X86::VDIVPSZ128rrk:
case X86::VDIVPSZ128rrkz:
case X86::VDIVPSZ256rm:
case X86::VDIVPSZ256rmb:
case X86::VDIVPSZ256rmbk:
case X86::VDIVPSZ256rmbkz:
case X86::VDIVPSZ256rmk:
case X86::VDIVPSZ256rmkz:
case X86::VDIVPSZ256rr:
case X86::VDIVPSZ256rrk:
case X86::VDIVPSZ256rrkz:
case X86::VDIVPSZrb:
case X86::VDIVPSZrbk:
case X86::VDIVPSZrbkz:
case X86::VDIVPSZrm:
case X86::VDIVPSZrmb:
case X86::VDIVPSZrmbk:
case X86::VDIVPSZrmbkz:
case X86::VDIVPSZrmk:
case X86::VDIVPSZrmkz:
case X86::VDIVPSZrr:
case X86::VDIVPSZrrk:
case X86::VDIVPSZrrkz:
case X86::VDIVSDZrm:
case X86::VDIVSDZrr:
case X86::VDIVSDZrm_Int:
case X86::VDIVSDZrm_Intk:
case X86::VDIVSDZrm_Intkz:
case X86::VDIVSDZrr_Int:
case X86::VDIVSDZrr_Intk:
case X86::VDIVSDZrr_Intkz:
case X86::VDIVSDZrrb:
case X86::VDIVSDZrrbk:
case X86::VDIVSDZrrbkz:
case X86::VDIVSSZrm:
case X86::VDIVSSZrr:
case X86::VDIVSSZrm_Int:
case X86::VDIVSSZrm_Intk:
case X86::VDIVSSZrm_Intkz:
case X86::VDIVSSZrr_Int:
case X86::VDIVSSZrr_Intk:
case X86::VDIVSSZrr_Intkz:
case X86::VDIVSSZrrb:
case X86::VDIVSSZrrbk:
case X86::VDIVSSZrrbkz:
case X86::VSQRTPDZ128m:
case X86::VSQRTPDZ128mb:
case X86::VSQRTPDZ128mbk:
case X86::VSQRTPDZ128mbkz:
case X86::VSQRTPDZ128mk:
case X86::VSQRTPDZ128mkz:
case X86::VSQRTPDZ128r:
case X86::VSQRTPDZ128rk:
case X86::VSQRTPDZ128rkz:
case X86::VSQRTPDZ256m:
case X86::VSQRTPDZ256mb:
case X86::VSQRTPDZ256mbk:
case X86::VSQRTPDZ256mbkz:
case X86::VSQRTPDZ256mk:
case X86::VSQRTPDZ256mkz:
case X86::VSQRTPDZ256r:
case X86::VSQRTPDZ256rk:
case X86::VSQRTPDZ256rkz:
case X86::VSQRTPDZm:
case X86::VSQRTPDZmb:
case X86::VSQRTPDZmbk:
case X86::VSQRTPDZmbkz:
case X86::VSQRTPDZmk:
case X86::VSQRTPDZmkz:
case X86::VSQRTPDZr:
case X86::VSQRTPDZrb:
case X86::VSQRTPDZrbk:
case X86::VSQRTPDZrbkz:
case X86::VSQRTPDZrk:
case X86::VSQRTPDZrkz:
case X86::VSQRTPSZ128m:
case X86::VSQRTPSZ128mb:
case X86::VSQRTPSZ128mbk:
case X86::VSQRTPSZ128mbkz:
case X86::VSQRTPSZ128mk:
case X86::VSQRTPSZ128mkz:
case X86::VSQRTPSZ128r:
case X86::VSQRTPSZ128rk:
case X86::VSQRTPSZ128rkz:
case X86::VSQRTPSZ256m:
case X86::VSQRTPSZ256mb:
case X86::VSQRTPSZ256mbk:
case X86::VSQRTPSZ256mbkz:
case X86::VSQRTPSZ256mk:
case X86::VSQRTPSZ256mkz:
case X86::VSQRTPSZ256r:
case X86::VSQRTPSZ256rk:
case X86::VSQRTPSZ256rkz:
case X86::VSQRTPSZm:
case X86::VSQRTPSZmb:
case X86::VSQRTPSZmbk:
case X86::VSQRTPSZmbkz:
case X86::VSQRTPSZmk:
case X86::VSQRTPSZmkz:
case X86::VSQRTPSZr:
case X86::VSQRTPSZrb:
case X86::VSQRTPSZrbk:
case X86::VSQRTPSZrbkz:
case X86::VSQRTPSZrk:
case X86::VSQRTPSZrkz:
case X86::VSQRTSDZm:
case X86::VSQRTSDZm_Int:
case X86::VSQRTSDZm_Intk:
case X86::VSQRTSDZm_Intkz:
case X86::VSQRTSDZr:
case X86::VSQRTSDZr_Int:
case X86::VSQRTSDZr_Intk:
case X86::VSQRTSDZr_Intkz:
case X86::VSQRTSDZrb_Int:
case X86::VSQRTSDZrb_Intk:
case X86::VSQRTSDZrb_Intkz:
case X86::VSQRTSSZm:
case X86::VSQRTSSZm_Int:
case X86::VSQRTSSZm_Intk:
case X86::VSQRTSSZm_Intkz:
case X86::VSQRTSSZr:
case X86::VSQRTSSZr_Int:
case X86::VSQRTSSZr_Intk:
case X86::VSQRTSSZr_Intkz:
case X86::VSQRTSSZrb_Int:
case X86::VSQRTSSZrb_Intk:
case X86::VSQRTSSZrb_Intkz:
case X86::VGATHERDPDYrm:
case X86::VGATHERDPDZ128rm:
case X86::VGATHERDPDZ256rm:
case X86::VGATHERDPDZrm:
case X86::VGATHERDPDrm:
case X86::VGATHERDPSYrm:
case X86::VGATHERDPSZ128rm:
case X86::VGATHERDPSZ256rm:
case X86::VGATHERDPSZrm:
case X86::VGATHERDPSrm:
case X86::VGATHERPF0DPDm:
case X86::VGATHERPF0DPSm:
case X86::VGATHERPF0QPDm:
case X86::VGATHERPF0QPSm:
case X86::VGATHERPF1DPDm:
case X86::VGATHERPF1DPSm:
case X86::VGATHERPF1QPDm:
case X86::VGATHERPF1QPSm:
case X86::VGATHERQPDYrm:
case X86::VGATHERQPDZ128rm:
case X86::VGATHERQPDZ256rm:
case X86::VGATHERQPDZrm:
case X86::VGATHERQPDrm:
case X86::VGATHERQPSYrm:
case X86::VGATHERQPSZ128rm:
case X86::VGATHERQPSZ256rm:
case X86::VGATHERQPSZrm:
case X86::VGATHERQPSrm:
case X86::VPGATHERDDYrm:
case X86::VPGATHERDDZ128rm:
case X86::VPGATHERDDZ256rm:
case X86::VPGATHERDDZrm:
case X86::VPGATHERDDrm:
case X86::VPGATHERDQYrm:
case X86::VPGATHERDQZ128rm:
case X86::VPGATHERDQZ256rm:
case X86::VPGATHERDQZrm:
case X86::VPGATHERDQrm:
case X86::VPGATHERQDYrm:
case X86::VPGATHERQDZ128rm:
case X86::VPGATHERQDZ256rm:
case X86::VPGATHERQDZrm:
case X86::VPGATHERQDrm:
case X86::VPGATHERQQYrm:
case X86::VPGATHERQQZ128rm:
case X86::VPGATHERQQZ256rm:
case X86::VPGATHERQQZrm:
case X86::VPGATHERQQrm:
case X86::VSCATTERDPDZ128mr:
case X86::VSCATTERDPDZ256mr:
case X86::VSCATTERDPDZmr:
case X86::VSCATTERDPSZ128mr:
case X86::VSCATTERDPSZ256mr:
case X86::VSCATTERDPSZmr:
case X86::VSCATTERPF0DPDm:
case X86::VSCATTERPF0DPSm:
case X86::VSCATTERPF0QPDm:
case X86::VSCATTERPF0QPSm:
case X86::VSCATTERPF1DPDm:
case X86::VSCATTERPF1DPSm:
case X86::VSCATTERPF1QPDm:
case X86::VSCATTERPF1QPSm:
case X86::VSCATTERQPDZ128mr:
case X86::VSCATTERQPDZ256mr:
case X86::VSCATTERQPDZmr:
case X86::VSCATTERQPSZ128mr:
case X86::VSCATTERQPSZ256mr:
case X86::VSCATTERQPSZmr:
case X86::VPSCATTERDDZ128mr:
case X86::VPSCATTERDDZ256mr:
case X86::VPSCATTERDDZmr:
case X86::VPSCATTERDQZ128mr:
case X86::VPSCATTERDQZ256mr:
case X86::VPSCATTERDQZmr:
case X86::VPSCATTERQDZ128mr:
case X86::VPSCATTERQDZ256mr:
case X86::VPSCATTERQDZmr:
case X86::VPSCATTERQQZ128mr:
case X86::VPSCATTERQQZ256mr:
case X86::VPSCATTERQQZmr:
return true;
}
}
bool X86InstrInfo::hasHighOperandLatency(const TargetSchedModel &SchedModel,
const MachineRegisterInfo *MRI,
const MachineInstr &DefMI,
unsigned DefIdx,
const MachineInstr &UseMI,
unsigned UseIdx) const {
return isHighLatencyDef(DefMI.getOpcode());
}
bool X86InstrInfo::hasReassociableOperands(const MachineInstr &Inst,
const MachineBasicBlock *MBB) const {
assert((Inst.getNumOperands() == 3 || Inst.getNumOperands() == 4) &&
"Reassociation needs binary operators");
// Integer binary math/logic instructions have a third source operand:
// the EFLAGS register. That operand must be both defined here and never
// used; ie, it must be dead. If the EFLAGS operand is live, then we can
// not change anything because rearranging the operands could affect other
// instructions that depend on the exact status flags (zero, sign, etc.)
// that are set by using these particular operands with this operation.
if (Inst.getNumOperands() == 4) {
assert(Inst.getOperand(3).isReg() &&
Inst.getOperand(3).getReg() == X86::EFLAGS &&
"Unexpected operand in reassociable instruction");
if (!Inst.getOperand(3).isDead())
return false;
}
return TargetInstrInfo::hasReassociableOperands(Inst, MBB);
}
// TODO: There are many more machine instruction opcodes to match:
// 1. Other data types (integer, vectors)
// 2. Other math / logic operations (xor, or)
// 3. Other forms of the same operation (intrinsics and other variants)
bool X86InstrInfo::isAssociativeAndCommutative(const MachineInstr &Inst) const {
switch (Inst.getOpcode()) {
case X86::AND8rr:
case X86::AND16rr:
case X86::AND32rr:
case X86::AND64rr:
case X86::OR8rr:
case X86::OR16rr:
case X86::OR32rr:
case X86::OR64rr:
case X86::XOR8rr:
case X86::XOR16rr:
case X86::XOR32rr:
case X86::XOR64rr:
case X86::IMUL16rr:
case X86::IMUL32rr:
case X86::IMUL64rr:
case X86::PANDrr:
case X86::PORrr:
case X86::PXORrr:
case X86::ANDPDrr:
case X86::ANDPSrr:
case X86::ORPDrr:
case X86::ORPSrr:
case X86::XORPDrr:
case X86::XORPSrr:
case X86::PADDBrr:
case X86::PADDWrr:
case X86::PADDDrr:
case X86::PADDQrr:
case X86::VPANDrr:
case X86::VPANDYrr:
case X86::VPANDDZ128rr:
case X86::VPANDDZ256rr:
case X86::VPANDDZrr:
case X86::VPANDQZ128rr:
case X86::VPANDQZ256rr:
case X86::VPANDQZrr:
case X86::VPORrr:
case X86::VPORYrr:
case X86::VPORDZ128rr:
case X86::VPORDZ256rr:
case X86::VPORDZrr:
case X86::VPORQZ128rr:
case X86::VPORQZ256rr:
case X86::VPORQZrr:
case X86::VPXORrr:
case X86::VPXORYrr:
case X86::VPXORDZ128rr:
case X86::VPXORDZ256rr:
case X86::VPXORDZrr:
case X86::VPXORQZ128rr:
case X86::VPXORQZ256rr:
case X86::VPXORQZrr:
case X86::VANDPDrr:
case X86::VANDPSrr:
case X86::VANDPDYrr:
case X86::VANDPSYrr:
case X86::VANDPDZ128rr:
case X86::VANDPSZ128rr:
case X86::VANDPDZ256rr:
case X86::VANDPSZ256rr:
case X86::VANDPDZrr:
case X86::VANDPSZrr:
case X86::VORPDrr:
case X86::VORPSrr:
case X86::VORPDYrr:
case X86::VORPSYrr:
case X86::VORPDZ128rr:
case X86::VORPSZ128rr:
case X86::VORPDZ256rr:
case X86::VORPSZ256rr:
case X86::VORPDZrr:
case X86::VORPSZrr:
case X86::VXORPDrr:
case X86::VXORPSrr:
case X86::VXORPDYrr:
case X86::VXORPSYrr:
case X86::VXORPDZ128rr:
case X86::VXORPSZ128rr:
case X86::VXORPDZ256rr:
case X86::VXORPSZ256rr:
case X86::VXORPDZrr:
case X86::VXORPSZrr:
case X86::KADDBrr:
case X86::KADDWrr:
case X86::KADDDrr:
case X86::KADDQrr:
case X86::KANDBrr:
case X86::KANDWrr:
case X86::KANDDrr:
case X86::KANDQrr:
case X86::KORBrr:
case X86::KORWrr:
case X86::KORDrr:
case X86::KORQrr:
case X86::KXORBrr:
case X86::KXORWrr:
case X86::KXORDrr:
case X86::KXORQrr:
case X86::VPADDBrr:
case X86::VPADDWrr:
case X86::VPADDDrr:
case X86::VPADDQrr:
case X86::VPADDBYrr:
case X86::VPADDWYrr:
case X86::VPADDDYrr:
case X86::VPADDQYrr:
case X86::VPADDBZ128rr:
case X86::VPADDWZ128rr:
case X86::VPADDDZ128rr:
case X86::VPADDQZ128rr:
case X86::VPADDBZ256rr:
case X86::VPADDWZ256rr:
case X86::VPADDDZ256rr:
case X86::VPADDQZ256rr:
case X86::VPADDBZrr:
case X86::VPADDWZrr:
case X86::VPADDDZrr:
case X86::VPADDQZrr:
case X86::VPMULLWrr:
case X86::VPMULLWYrr:
case X86::VPMULLWZ128rr:
case X86::VPMULLWZ256rr:
case X86::VPMULLWZrr:
case X86::VPMULLDrr:
case X86::VPMULLDYrr:
case X86::VPMULLDZ128rr:
case X86::VPMULLDZ256rr:
case X86::VPMULLDZrr:
case X86::VPMULLQZ128rr:
case X86::VPMULLQZ256rr:
case X86::VPMULLQZrr:
// Normal min/max instructions are not commutative because of NaN and signed
// zero semantics, but these are. Thus, there's no need to check for global
// relaxed math; the instructions themselves have the properties we need.
case X86::MAXCPDrr:
case X86::MAXCPSrr:
case X86::MAXCSDrr:
case X86::MAXCSSrr:
case X86::MINCPDrr:
case X86::MINCPSrr:
case X86::MINCSDrr:
case X86::MINCSSrr:
case X86::VMAXCPDrr:
case X86::VMAXCPSrr:
case X86::VMAXCPDYrr:
case X86::VMAXCPSYrr:
case X86::VMAXCPDZ128rr:
case X86::VMAXCPSZ128rr:
case X86::VMAXCPDZ256rr:
case X86::VMAXCPSZ256rr:
case X86::VMAXCPDZrr:
case X86::VMAXCPSZrr:
case X86::VMAXCSDrr:
case X86::VMAXCSSrr:
case X86::VMAXCSDZrr:
case X86::VMAXCSSZrr:
case X86::VMINCPDrr:
case X86::VMINCPSrr:
case X86::VMINCPDYrr:
case X86::VMINCPSYrr:
case X86::VMINCPDZ128rr:
case X86::VMINCPSZ128rr:
case X86::VMINCPDZ256rr:
case X86::VMINCPSZ256rr:
case X86::VMINCPDZrr:
case X86::VMINCPSZrr:
case X86::VMINCSDrr:
case X86::VMINCSSrr:
case X86::VMINCSDZrr:
case X86::VMINCSSZrr:
return true;
case X86::ADDPDrr:
case X86::ADDPSrr:
case X86::ADDSDrr:
case X86::ADDSSrr:
case X86::MULPDrr:
case X86::MULPSrr:
case X86::MULSDrr:
case X86::MULSSrr:
case X86::VADDPDrr:
case X86::VADDPSrr:
case X86::VADDPDYrr:
case X86::VADDPSYrr:
case X86::VADDPDZ128rr:
case X86::VADDPSZ128rr:
case X86::VADDPDZ256rr:
case X86::VADDPSZ256rr:
case X86::VADDPDZrr:
case X86::VADDPSZrr:
case X86::VADDSDrr:
case X86::VADDSSrr:
case X86::VADDSDZrr:
case X86::VADDSSZrr:
case X86::VMULPDrr:
case X86::VMULPSrr:
case X86::VMULPDYrr:
case X86::VMULPSYrr:
case X86::VMULPDZ128rr:
case X86::VMULPSZ128rr:
case X86::VMULPDZ256rr:
case X86::VMULPSZ256rr:
case X86::VMULPDZrr:
case X86::VMULPSZrr:
case X86::VMULSDrr:
case X86::VMULSSrr:
case X86::VMULSDZrr:
case X86::VMULSSZrr:
return Inst.getParent()->getParent()->getTarget().Options.UnsafeFPMath;
default:
return false;
}
}
/// This is an architecture-specific helper function of reassociateOps.
/// Set special operand attributes for new instructions after reassociation.
void X86InstrInfo::setSpecialOperandAttr(MachineInstr &OldMI1,
MachineInstr &OldMI2,
MachineInstr &NewMI1,
MachineInstr &NewMI2) const {
// Integer instructions define an implicit EFLAGS source register operand as
// the third source (fourth total) operand.
if (OldMI1.getNumOperands() != 4 || OldMI2.getNumOperands() != 4)
return;
assert(NewMI1.getNumOperands() == 4 && NewMI2.getNumOperands() == 4 &&
"Unexpected instruction type for reassociation");
MachineOperand &OldOp1 = OldMI1.getOperand(3);
MachineOperand &OldOp2 = OldMI2.getOperand(3);
MachineOperand &NewOp1 = NewMI1.getOperand(3);
MachineOperand &NewOp2 = NewMI2.getOperand(3);
assert(OldOp1.isReg() && OldOp1.getReg() == X86::EFLAGS && OldOp1.isDead() &&
"Must have dead EFLAGS operand in reassociable instruction");
assert(OldOp2.isReg() && OldOp2.getReg() == X86::EFLAGS && OldOp2.isDead() &&
"Must have dead EFLAGS operand in reassociable instruction");
(void)OldOp1;
(void)OldOp2;
assert(NewOp1.isReg() && NewOp1.getReg() == X86::EFLAGS &&
"Unexpected operand in reassociable instruction");
assert(NewOp2.isReg() && NewOp2.getReg() == X86::EFLAGS &&
"Unexpected operand in reassociable instruction");
// Mark the new EFLAGS operands as dead to be helpful to subsequent iterations
// of this pass or other passes. The EFLAGS operands must be dead in these new
// instructions because the EFLAGS operands in the original instructions must
// be dead in order for reassociation to occur.
NewOp1.setIsDead();
NewOp2.setIsDead();
}
std::pair<unsigned, unsigned>
X86InstrInfo::decomposeMachineOperandsTargetFlags(unsigned TF) const {
return std::make_pair(TF, 0u);
}
ArrayRef<std::pair<unsigned, const char *>>
X86InstrInfo::getSerializableDirectMachineOperandTargetFlags() const {
using namespace X86II;
static const std::pair<unsigned, const char *> TargetFlags[] = {
{MO_GOT_ABSOLUTE_ADDRESS, "x86-got-absolute-address"},
{MO_PIC_BASE_OFFSET, "x86-pic-base-offset"},
{MO_GOT, "x86-got"},
{MO_GOTOFF, "x86-gotoff"},
{MO_GOTPCREL, "x86-gotpcrel"},
{MO_PLT, "x86-plt"},
{MO_TLSGD, "x86-tlsgd"},
{MO_TLSLD, "x86-tlsld"},
{MO_TLSLDM, "x86-tlsldm"},
{MO_GOTTPOFF, "x86-gottpoff"},
{MO_INDNTPOFF, "x86-indntpoff"},
{MO_TPOFF, "x86-tpoff"},
{MO_DTPOFF, "x86-dtpoff"},
{MO_NTPOFF, "x86-ntpoff"},
{MO_GOTNTPOFF, "x86-gotntpoff"},
{MO_DLLIMPORT, "x86-dllimport"},
{MO_DARWIN_NONLAZY, "x86-darwin-nonlazy"},
{MO_DARWIN_NONLAZY_PIC_BASE, "x86-darwin-nonlazy-pic-base"},
{MO_TLVP, "x86-tlvp"},
{MO_TLVP_PIC_BASE, "x86-tlvp-pic-base"},
{MO_SECREL, "x86-secrel"}};
return makeArrayRef(TargetFlags);
}
bool X86InstrInfo::isTailCall(const MachineInstr &Inst) const {
switch (Inst.getOpcode()) {
case X86::TCRETURNdi:
case X86::TCRETURNmi:
case X86::TCRETURNri:
case X86::TCRETURNdi64:
case X86::TCRETURNmi64:
case X86::TCRETURNri64:
case X86::TAILJMPd:
case X86::TAILJMPm:
case X86::TAILJMPr:
case X86::TAILJMPd64:
case X86::TAILJMPm64:
case X86::TAILJMPr64:
case X86::TAILJMPm64_REX:
case X86::TAILJMPr64_REX:
return true;
default:
return false;
}
}
namespace {
/// Create Global Base Reg pass. This initializes the PIC
/// global base register for x86-32.
struct CGBR : public MachineFunctionPass {
static char ID;
CGBR() : MachineFunctionPass(ID) {}
bool runOnMachineFunction(MachineFunction &MF) override {
const X86TargetMachine *TM =
static_cast<const X86TargetMachine *>(&MF.getTarget());
const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>();
// Don't do anything if this is 64-bit as 64-bit PIC
// uses RIP relative addressing.
if (STI.is64Bit())
return false;
// Only emit a global base reg in PIC mode.
if (!TM->isPositionIndependent())
return false;
X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
unsigned GlobalBaseReg = X86FI->getGlobalBaseReg();
// If we didn't need a GlobalBaseReg, don't insert code.
if (GlobalBaseReg == 0)
return false;
// Insert the set of GlobalBaseReg into the first MBB of the function
MachineBasicBlock &FirstMBB = MF.front();
MachineBasicBlock::iterator MBBI = FirstMBB.begin();
DebugLoc DL = FirstMBB.findDebugLoc(MBBI);
MachineRegisterInfo &RegInfo = MF.getRegInfo();
const X86InstrInfo *TII = STI.getInstrInfo();
unsigned PC;
if (STI.isPICStyleGOT())
PC = RegInfo.createVirtualRegister(&X86::GR32RegClass);
else
PC = GlobalBaseReg;
// Operand of MovePCtoStack is completely ignored by asm printer. It's
// only used in JIT code emission as displacement to pc.
BuildMI(FirstMBB, MBBI, DL, TII->get(X86::MOVPC32r), PC).addImm(0);
// If we're using vanilla 'GOT' PIC style, we should use relative addressing
// not to pc, but to _GLOBAL_OFFSET_TABLE_ external.
if (STI.isPICStyleGOT()) {
// Generate addl $__GLOBAL_OFFSET_TABLE_ + [.-piclabel], %some_register
BuildMI(FirstMBB, MBBI, DL, TII->get(X86::ADD32ri), GlobalBaseReg)
.addReg(PC).addExternalSymbol("_GLOBAL_OFFSET_TABLE_",
X86II::MO_GOT_ABSOLUTE_ADDRESS);
}
return true;
}
StringRef getPassName() const override {
return "X86 PIC Global Base Reg Initialization";
}
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.setPreservesCFG();
MachineFunctionPass::getAnalysisUsage(AU);
}
};
}
char CGBR::ID = 0;
FunctionPass*
llvm::createX86GlobalBaseRegPass() { return new CGBR(); }
namespace {
struct LDTLSCleanup : public MachineFunctionPass {
static char ID;
LDTLSCleanup() : MachineFunctionPass(ID) {}
bool runOnMachineFunction(MachineFunction &MF) override {
if (skipFunction(*MF.getFunction()))
return false;
X86MachineFunctionInfo *MFI = MF.getInfo<X86MachineFunctionInfo>();
if (MFI->getNumLocalDynamicTLSAccesses() < 2) {
// No point folding accesses if there isn't at least two.
return false;
}
MachineDominatorTree *DT = &getAnalysis<MachineDominatorTree>();
return VisitNode(DT->getRootNode(), 0);
}
// Visit the dominator subtree rooted at Node in pre-order.
// If TLSBaseAddrReg is non-null, then use that to replace any
// TLS_base_addr instructions. Otherwise, create the register
// when the first such instruction is seen, and then use it
// as we encounter more instructions.
bool VisitNode(MachineDomTreeNode *Node, unsigned TLSBaseAddrReg) {
MachineBasicBlock *BB = Node->getBlock();
bool Changed = false;
// Traverse the current block.
for (MachineBasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;
++I) {
switch (I->getOpcode()) {
case X86::TLS_base_addr32:
case X86::TLS_base_addr64:
if (TLSBaseAddrReg)
I = ReplaceTLSBaseAddrCall(*I, TLSBaseAddrReg);
else
I = SetRegister(*I, &TLSBaseAddrReg);
Changed = true;
break;
default:
break;
}
}
// Visit the children of this block in the dominator tree.
for (MachineDomTreeNode::iterator I = Node->begin(), E = Node->end();
I != E; ++I) {
Changed |= VisitNode(*I, TLSBaseAddrReg);
}
return Changed;
}
// Replace the TLS_base_addr instruction I with a copy from
// TLSBaseAddrReg, returning the new instruction.
MachineInstr *ReplaceTLSBaseAddrCall(MachineInstr &I,
unsigned TLSBaseAddrReg) {
MachineFunction *MF = I.getParent()->getParent();
const X86Subtarget &STI = MF->getSubtarget<X86Subtarget>();
const bool is64Bit = STI.is64Bit();
const X86InstrInfo *TII = STI.getInstrInfo();
// Insert a Copy from TLSBaseAddrReg to RAX/EAX.
MachineInstr *Copy =
BuildMI(*I.getParent(), I, I.getDebugLoc(),
TII->get(TargetOpcode::COPY), is64Bit ? X86::RAX : X86::EAX)
.addReg(TLSBaseAddrReg);
// Erase the TLS_base_addr instruction.
I.eraseFromParent();
return Copy;
}
// Create a virtal register in *TLSBaseAddrReg, and populate it by
// inserting a copy instruction after I. Returns the new instruction.
MachineInstr *SetRegister(MachineInstr &I, unsigned *TLSBaseAddrReg) {
MachineFunction *MF = I.getParent()->getParent();
const X86Subtarget &STI = MF->getSubtarget<X86Subtarget>();
const bool is64Bit = STI.is64Bit();
const X86InstrInfo *TII = STI.getInstrInfo();
// Create a virtual register for the TLS base address.
MachineRegisterInfo &RegInfo = MF->getRegInfo();
*TLSBaseAddrReg = RegInfo.createVirtualRegister(is64Bit
? &X86::GR64RegClass
: &X86::GR32RegClass);
// Insert a copy from RAX/EAX to TLSBaseAddrReg.
MachineInstr *Next = I.getNextNode();
MachineInstr *Copy =
BuildMI(*I.getParent(), Next, I.getDebugLoc(),
TII->get(TargetOpcode::COPY), *TLSBaseAddrReg)
.addReg(is64Bit ? X86::RAX : X86::EAX);
return Copy;
}
StringRef getPassName() const override {
return "Local Dynamic TLS Access Clean-up";
}
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.setPreservesCFG();
AU.addRequired<MachineDominatorTree>();
MachineFunctionPass::getAnalysisUsage(AU);
}
};
}
char LDTLSCleanup::ID = 0;
FunctionPass*
llvm::createCleanupLocalDynamicTLSPass() { return new LDTLSCleanup(); }
Index: projects/clang400-import/contrib/llvm/lib/Target/X86/X86InstrInfo.h
===================================================================
--- projects/clang400-import/contrib/llvm/lib/Target/X86/X86InstrInfo.h (revision 313642)
+++ projects/clang400-import/contrib/llvm/lib/Target/X86/X86InstrInfo.h (revision 313643)
@@ -1,608 +1,601 @@
//===-- X86InstrInfo.h - X86 Instruction Information ------------*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file contains the X86 implementation of the TargetInstrInfo class.
//
//===----------------------------------------------------------------------===//
#ifndef LLVM_LIB_TARGET_X86_X86INSTRINFO_H
#define LLVM_LIB_TARGET_X86_X86INSTRINFO_H
#include "MCTargetDesc/X86BaseInfo.h"
#include "X86InstrFMA3Info.h"
#include "X86RegisterInfo.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/Target/TargetInstrInfo.h"
#define GET_INSTRINFO_HEADER
#include "X86GenInstrInfo.inc"
namespace llvm {
class MachineInstrBuilder;
class X86RegisterInfo;
class X86Subtarget;
namespace X86 {
// X86 specific condition code. These correspond to X86_*_COND in
// X86InstrInfo.td. They must be kept in synch.
enum CondCode {
COND_A = 0,
COND_AE = 1,
COND_B = 2,
COND_BE = 3,
COND_E = 4,
COND_G = 5,
COND_GE = 6,
COND_L = 7,
COND_LE = 8,
COND_NE = 9,
COND_NO = 10,
COND_NP = 11,
COND_NS = 12,
COND_O = 13,
COND_P = 14,
COND_S = 15,
LAST_VALID_COND = COND_S,
// Artificial condition codes. These are used by AnalyzeBranch
// to indicate a block terminated with two conditional branches that together
// form a compound condition. They occur in code using FCMP_OEQ or FCMP_UNE,
// which can't be represented on x86 with a single condition. These
// are never used in MachineInstrs and are inverses of one another.
COND_NE_OR_P,
COND_E_AND_NP,
COND_INVALID
};
// Turn condition code into conditional branch opcode.
unsigned GetCondBranchFromCond(CondCode CC);
/// \brief Return a set opcode for the given condition and whether it has
/// a memory operand.
unsigned getSETFromCond(CondCode CC, bool HasMemoryOperand = false);
/// \brief Return a cmov opcode for the given condition, register size in
/// bytes, and operand type.
unsigned getCMovFromCond(CondCode CC, unsigned RegBytes,
bool HasMemoryOperand = false);
// Turn CMov opcode into condition code.
CondCode getCondFromCMovOpc(unsigned Opc);
/// GetOppositeBranchCondition - Return the inverse of the specified cond,
/// e.g. turning COND_E to COND_NE.
CondCode GetOppositeBranchCondition(CondCode CC);
} // end namespace X86;
/// isGlobalStubReference - Return true if the specified TargetFlag operand is
/// a reference to a stub for a global, not the global itself.
inline static bool isGlobalStubReference(unsigned char TargetFlag) {
switch (TargetFlag) {
case X86II::MO_DLLIMPORT: // dllimport stub.
case X86II::MO_GOTPCREL: // rip-relative GOT reference.
case X86II::MO_GOT: // normal GOT reference.
case X86II::MO_DARWIN_NONLAZY_PIC_BASE: // Normal $non_lazy_ptr ref.
case X86II::MO_DARWIN_NONLAZY: // Normal $non_lazy_ptr ref.
return true;
default:
return false;
}
}
/// isGlobalRelativeToPICBase - Return true if the specified global value
/// reference is relative to a 32-bit PIC base (X86ISD::GlobalBaseReg). If this
/// is true, the addressing mode has the PIC base register added in (e.g. EBX).
inline static bool isGlobalRelativeToPICBase(unsigned char TargetFlag) {
switch (TargetFlag) {
case X86II::MO_GOTOFF: // isPICStyleGOT: local global.
case X86II::MO_GOT: // isPICStyleGOT: other global.
case X86II::MO_PIC_BASE_OFFSET: // Darwin local global.
case X86II::MO_DARWIN_NONLAZY_PIC_BASE: // Darwin/32 external global.
case X86II::MO_TLVP: // ??? Pretty sure..
return true;
default:
return false;
}
}
inline static bool isScale(const MachineOperand &MO) {
return MO.isImm() &&
(MO.getImm() == 1 || MO.getImm() == 2 ||
MO.getImm() == 4 || MO.getImm() == 8);
}
inline static bool isLeaMem(const MachineInstr &MI, unsigned Op) {
if (MI.getOperand(Op).isFI())
return true;
return Op + X86::AddrSegmentReg <= MI.getNumOperands() &&
MI.getOperand(Op + X86::AddrBaseReg).isReg() &&
isScale(MI.getOperand(Op + X86::AddrScaleAmt)) &&
MI.getOperand(Op + X86::AddrIndexReg).isReg() &&
(MI.getOperand(Op + X86::AddrDisp).isImm() ||
MI.getOperand(Op + X86::AddrDisp).isGlobal() ||
MI.getOperand(Op + X86::AddrDisp).isCPI() ||
MI.getOperand(Op + X86::AddrDisp).isJTI());
}
inline static bool isMem(const MachineInstr &MI, unsigned Op) {
if (MI.getOperand(Op).isFI())
return true;
return Op + X86::AddrNumOperands <= MI.getNumOperands() &&
MI.getOperand(Op + X86::AddrSegmentReg).isReg() && isLeaMem(MI, Op);
}
class X86InstrInfo final : public X86GenInstrInfo {
X86Subtarget &Subtarget;
const X86RegisterInfo RI;
/// RegOp2MemOpTable3Addr, RegOp2MemOpTable0, RegOp2MemOpTable1,
/// RegOp2MemOpTable2, RegOp2MemOpTable3 - Load / store folding opcode maps.
///
typedef DenseMap<unsigned,
std::pair<uint16_t, uint16_t> > RegOp2MemOpTableType;
RegOp2MemOpTableType RegOp2MemOpTable2Addr;
RegOp2MemOpTableType RegOp2MemOpTable0;
RegOp2MemOpTableType RegOp2MemOpTable1;
RegOp2MemOpTableType RegOp2MemOpTable2;
RegOp2MemOpTableType RegOp2MemOpTable3;
RegOp2MemOpTableType RegOp2MemOpTable4;
/// MemOp2RegOpTable - Load / store unfolding opcode map.
///
typedef DenseMap<unsigned,
std::pair<uint16_t, uint16_t> > MemOp2RegOpTableType;
MemOp2RegOpTableType MemOp2RegOpTable;
static void AddTableEntry(RegOp2MemOpTableType &R2MTable,
MemOp2RegOpTableType &M2RTable,
uint16_t RegOp, uint16_t MemOp, uint16_t Flags);
virtual void anchor();
bool AnalyzeBranchImpl(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
MachineBasicBlock *&FBB,
SmallVectorImpl<MachineOperand> &Cond,
SmallVectorImpl<MachineInstr *> &CondBranches,
bool AllowModify) const;
public:
explicit X86InstrInfo(X86Subtarget &STI);
/// getRegisterInfo - TargetInstrInfo is a superset of MRegister info. As
/// such, whenever a client has an instance of instruction info, it should
/// always be able to get register info as well (through this method).
///
const X86RegisterInfo &getRegisterInfo() const { return RI; }
/// getSPAdjust - This returns the stack pointer adjustment made by
/// this instruction. For x86, we need to handle more complex call
/// sequences involving PUSHes.
int getSPAdjust(const MachineInstr &MI) const override;
/// isCoalescableExtInstr - Return true if the instruction is a "coalescable"
/// extension instruction. That is, it's like a copy where it's legal for the
/// source to overlap the destination. e.g. X86::MOVSX64rr32. If this returns
/// true, then it's expected the pre-extension value is available as a subreg
/// of the result register. This also returns the sub-register index in
/// SubIdx.
bool isCoalescableExtInstr(const MachineInstr &MI,
unsigned &SrcReg, unsigned &DstReg,
unsigned &SubIdx) const override;
unsigned isLoadFromStackSlot(const MachineInstr &MI,
int &FrameIndex) const override;
/// isLoadFromStackSlotPostFE - Check for post-frame ptr elimination
/// stack locations as well. This uses a heuristic so it isn't
/// reliable for correctness.
unsigned isLoadFromStackSlotPostFE(const MachineInstr &MI,
int &FrameIndex) const override;
unsigned isStoreToStackSlot(const MachineInstr &MI,
int &FrameIndex) const override;
/// isStoreToStackSlotPostFE - Check for post-frame ptr elimination
/// stack locations as well. This uses a heuristic so it isn't
/// reliable for correctness.
unsigned isStoreToStackSlotPostFE(const MachineInstr &MI,
int &FrameIndex) const override;
bool isReallyTriviallyReMaterializable(const MachineInstr &MI,
AliasAnalysis *AA) const override;
void reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
unsigned DestReg, unsigned SubIdx,
const MachineInstr &Orig,
const TargetRegisterInfo &TRI) const override;
/// Given an operand within a MachineInstr, insert preceding code to put it
/// into the right format for a particular kind of LEA instruction. This may
/// involve using an appropriate super-register instead (with an implicit use
/// of the original) or creating a new virtual register and inserting COPY
/// instructions to get the data into the right class.
///
/// Reference parameters are set to indicate how caller should add this
/// operand to the LEA instruction.
bool classifyLEAReg(MachineInstr &MI, const MachineOperand &Src,
unsigned LEAOpcode, bool AllowSP, unsigned &NewSrc,
bool &isKill, bool &isUndef,
MachineOperand &ImplicitOp, LiveVariables *LV) const;
/// convertToThreeAddress - This method must be implemented by targets that
/// set the M_CONVERTIBLE_TO_3_ADDR flag. When this flag is set, the target
/// may be able to convert a two-address instruction into a true
/// three-address instruction on demand. This allows the X86 target (for
/// example) to convert ADD and SHL instructions into LEA instructions if they
/// would require register copies due to two-addressness.
///
/// This method returns a null pointer if the transformation cannot be
/// performed, otherwise it returns the new instruction.
///
MachineInstr *convertToThreeAddress(MachineFunction::iterator &MFI,
MachineInstr &MI,
LiveVariables *LV) const override;
/// Returns true iff the routine could find two commutable operands in the
/// given machine instruction.
/// The 'SrcOpIdx1' and 'SrcOpIdx2' are INPUT and OUTPUT arguments. Their
/// input values can be re-defined in this method only if the input values
/// are not pre-defined, which is designated by the special value
/// 'CommuteAnyOperandIndex' assigned to it.
/// If both of indices are pre-defined and refer to some operands, then the
/// method simply returns true if the corresponding operands are commutable
/// and returns false otherwise.
///
/// For example, calling this method this way:
/// unsigned Op1 = 1, Op2 = CommuteAnyOperandIndex;
/// findCommutedOpIndices(MI, Op1, Op2);
/// can be interpreted as a query asking to find an operand that would be
/// commutable with the operand#1.
bool findCommutedOpIndices(MachineInstr &MI, unsigned &SrcOpIdx1,
unsigned &SrcOpIdx2) const override;
/// Returns true if the routine could find two commutable operands
/// in the given FMA instruction \p MI. Otherwise, returns false.
///
/// \p SrcOpIdx1 and \p SrcOpIdx2 are INPUT and OUTPUT arguments.
/// The output indices of the commuted operands are returned in these
/// arguments. Also, the input values of these arguments may be preset either
/// to indices of operands that must be commuted or be equal to a special
/// value 'CommuteAnyOperandIndex' which means that the corresponding
/// operand index is not set and this method is free to pick any of
/// available commutable operands.
/// The parameter \p FMA3Group keeps the reference to the group of relative
/// FMA3 opcodes including register/memory forms of 132/213/231 opcodes.
///
/// For example, calling this method this way:
/// unsigned Idx1 = 1, Idx2 = CommuteAnyOperandIndex;
/// findFMA3CommutedOpIndices(MI, Idx1, Idx2, FMA3Group);
/// can be interpreted as a query asking if the operand #1 can be swapped
/// with any other available operand (e.g. operand #2, operand #3, etc.).
///
/// The returned FMA opcode may differ from the opcode in the given MI.
/// For example, commuting the operands #1 and #3 in the following FMA
/// FMA213 #1, #2, #3
/// results into instruction with adjusted opcode:
/// FMA231 #3, #2, #1
bool findFMA3CommutedOpIndices(const MachineInstr &MI,
unsigned &SrcOpIdx1,
unsigned &SrcOpIdx2,
const X86InstrFMA3Group &FMA3Group) const;
/// Returns an adjusted FMA opcode that must be used in FMA instruction that
/// performs the same computations as the given \p MI but which has the
/// operands \p SrcOpIdx1 and \p SrcOpIdx2 commuted.
/// It may return 0 if it is unsafe to commute the operands.
/// Note that a machine instruction (instead of its opcode) is passed as the
/// first parameter to make it possible to analyze the instruction's uses and
/// commute the first operand of FMA even when it seems unsafe when you look
/// at the opcode. For example, it is Ok to commute the first operand of
/// VFMADD*SD_Int, if ONLY the lowest 64-bit element of the result is used.
///
/// The returned FMA opcode may differ from the opcode in the given \p MI.
/// For example, commuting the operands #1 and #3 in the following FMA
/// FMA213 #1, #2, #3
/// results into instruction with adjusted opcode:
/// FMA231 #3, #2, #1
unsigned getFMA3OpcodeToCommuteOperands(const MachineInstr &MI,
unsigned SrcOpIdx1,
unsigned SrcOpIdx2,
const X86InstrFMA3Group &FMA3Group) const;
// Branch analysis.
bool isUnpredicatedTerminator(const MachineInstr &MI) const override;
- bool isUnconditionalTailCall(const MachineInstr &MI) const override;
- bool canMakeTailCallConditional(SmallVectorImpl<MachineOperand> &Cond,
- const MachineInstr &TailCall) const override;
- void replaceBranchWithTailCall(MachineBasicBlock &MBB,
- SmallVectorImpl<MachineOperand> &Cond,
- const MachineInstr &TailCall) const override;
-
bool analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
MachineBasicBlock *&FBB,
SmallVectorImpl<MachineOperand> &Cond,
bool AllowModify) const override;
bool getMemOpBaseRegImmOfs(MachineInstr &LdSt, unsigned &BaseReg,
int64_t &Offset,
const TargetRegisterInfo *TRI) const override;
bool analyzeBranchPredicate(MachineBasicBlock &MBB,
TargetInstrInfo::MachineBranchPredicate &MBP,
bool AllowModify = false) const override;
unsigned removeBranch(MachineBasicBlock &MBB,
int *BytesRemoved = nullptr) const override;
unsigned insertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
MachineBasicBlock *FBB, ArrayRef<MachineOperand> Cond,
const DebugLoc &DL,
int *BytesAdded = nullptr) const override;
bool canInsertSelect(const MachineBasicBlock&, ArrayRef<MachineOperand> Cond,
unsigned, unsigned, int&, int&, int&) const override;
void insertSelect(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
const DebugLoc &DL, unsigned DstReg,
ArrayRef<MachineOperand> Cond, unsigned TrueReg,
unsigned FalseReg) const override;
void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
const DebugLoc &DL, unsigned DestReg, unsigned SrcReg,
bool KillSrc) const override;
void storeRegToStackSlot(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI,
unsigned SrcReg, bool isKill, int FrameIndex,
const TargetRegisterClass *RC,
const TargetRegisterInfo *TRI) const override;
void storeRegToAddr(MachineFunction &MF, unsigned SrcReg, bool isKill,
SmallVectorImpl<MachineOperand> &Addr,
const TargetRegisterClass *RC,
MachineInstr::mmo_iterator MMOBegin,
MachineInstr::mmo_iterator MMOEnd,
SmallVectorImpl<MachineInstr*> &NewMIs) const;
void loadRegFromStackSlot(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI,
unsigned DestReg, int FrameIndex,
const TargetRegisterClass *RC,
const TargetRegisterInfo *TRI) const override;
void loadRegFromAddr(MachineFunction &MF, unsigned DestReg,
SmallVectorImpl<MachineOperand> &Addr,
const TargetRegisterClass *RC,
MachineInstr::mmo_iterator MMOBegin,
MachineInstr::mmo_iterator MMOEnd,
SmallVectorImpl<MachineInstr*> &NewMIs) const;
bool expandPostRAPseudo(MachineInstr &MI) const override;
/// Check whether the target can fold a load that feeds a subreg operand
/// (or a subreg operand that feeds a store).
bool isSubregFoldable() const override { return true; }
/// foldMemoryOperand - If this target supports it, fold a load or store of
/// the specified stack slot into the specified machine instruction for the
/// specified operand(s). If this is possible, the target should perform the
/// folding and return true, otherwise it should return false. If it folds
/// the instruction, it is likely that the MachineInstruction the iterator
/// references has been changed.
MachineInstr *
foldMemoryOperandImpl(MachineFunction &MF, MachineInstr &MI,
ArrayRef<unsigned> Ops,
MachineBasicBlock::iterator InsertPt, int FrameIndex,
LiveIntervals *LIS = nullptr) const override;
/// foldMemoryOperand - Same as the previous version except it allows folding
/// of any load and store from / to any address, not just from a specific
/// stack slot.
MachineInstr *foldMemoryOperandImpl(
MachineFunction &MF, MachineInstr &MI, ArrayRef<unsigned> Ops,
MachineBasicBlock::iterator InsertPt, MachineInstr &LoadMI,
LiveIntervals *LIS = nullptr) const override;
/// unfoldMemoryOperand - Separate a single instruction which folded a load or
/// a store or a load and a store into two or more instruction. If this is
/// possible, returns true as well as the new instructions by reference.
bool
unfoldMemoryOperand(MachineFunction &MF, MachineInstr &MI, unsigned Reg,
bool UnfoldLoad, bool UnfoldStore,
SmallVectorImpl<MachineInstr *> &NewMIs) const override;
bool unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N,
SmallVectorImpl<SDNode*> &NewNodes) const override;
/// getOpcodeAfterMemoryUnfold - Returns the opcode of the would be new
/// instruction after load / store are unfolded from an instruction of the
/// specified opcode. It returns zero if the specified unfolding is not
/// possible. If LoadRegIndex is non-null, it is filled in with the operand
/// index of the operand which will hold the register holding the loaded
/// value.
unsigned getOpcodeAfterMemoryUnfold(unsigned Opc,
bool UnfoldLoad, bool UnfoldStore,
unsigned *LoadRegIndex = nullptr) const override;
/// areLoadsFromSameBasePtr - This is used by the pre-regalloc scheduler
/// to determine if two loads are loading from the same base address. It
/// should only return true if the base pointers are the same and the
/// only differences between the two addresses are the offset. It also returns
/// the offsets by reference.
bool areLoadsFromSameBasePtr(SDNode *Load1, SDNode *Load2, int64_t &Offset1,
int64_t &Offset2) const override;
/// shouldScheduleLoadsNear - This is a used by the pre-regalloc scheduler to
/// determine (in conjunction with areLoadsFromSameBasePtr) if two loads should
/// be scheduled togther. On some targets if two loads are loading from
/// addresses in the same cache line, it's better if they are scheduled
/// together. This function takes two integers that represent the load offsets
/// from the common base address. It returns true if it decides it's desirable
/// to schedule the two loads together. "NumLoads" is the number of loads that
/// have already been scheduled after Load1.
bool shouldScheduleLoadsNear(SDNode *Load1, SDNode *Load2,
int64_t Offset1, int64_t Offset2,
unsigned NumLoads) const override;
bool shouldScheduleAdjacent(const MachineInstr &First,
const MachineInstr &Second) const override;
void getNoopForMachoTarget(MCInst &NopInst) const override;
bool
reverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const override;
/// isSafeToMoveRegClassDefs - Return true if it's safe to move a machine
/// instruction that defines the specified register class.
bool isSafeToMoveRegClassDefs(const TargetRegisterClass *RC) const override;
/// isSafeToClobberEFLAGS - Return true if it's safe insert an instruction tha
/// would clobber the EFLAGS condition register. Note the result may be
/// conservative. If it cannot definitely determine the safety after visiting
/// a few instructions in each direction it assumes it's not safe.
bool isSafeToClobberEFLAGS(MachineBasicBlock &MBB,
MachineBasicBlock::iterator I) const;
/// True if MI has a condition code def, e.g. EFLAGS, that is
/// not marked dead.
bool hasLiveCondCodeDef(MachineInstr &MI) const;
/// getGlobalBaseReg - Return a virtual register initialized with the
/// the global base register value. Output instructions required to
/// initialize the register in the function entry block, if necessary.
///
unsigned getGlobalBaseReg(MachineFunction *MF) const;
std::pair<uint16_t, uint16_t>
getExecutionDomain(const MachineInstr &MI) const override;
void setExecutionDomain(MachineInstr &MI, unsigned Domain) const override;
unsigned
getPartialRegUpdateClearance(const MachineInstr &MI, unsigned OpNum,
const TargetRegisterInfo *TRI) const override;
unsigned getUndefRegClearance(const MachineInstr &MI, unsigned &OpNum,
const TargetRegisterInfo *TRI) const override;
void breakPartialRegDependency(MachineInstr &MI, unsigned OpNum,
const TargetRegisterInfo *TRI) const override;
MachineInstr *foldMemoryOperandImpl(MachineFunction &MF, MachineInstr &MI,
unsigned OpNum,
ArrayRef<MachineOperand> MOs,
MachineBasicBlock::iterator InsertPt,
unsigned Size, unsigned Alignment,
bool AllowCommute) const;
bool isHighLatencyDef(int opc) const override;
bool hasHighOperandLatency(const TargetSchedModel &SchedModel,
const MachineRegisterInfo *MRI,
const MachineInstr &DefMI, unsigned DefIdx,
const MachineInstr &UseMI,
unsigned UseIdx) const override;
bool useMachineCombiner() const override {
return true;
}
bool isAssociativeAndCommutative(const MachineInstr &Inst) const override;
bool hasReassociableOperands(const MachineInstr &Inst,
const MachineBasicBlock *MBB) const override;
void setSpecialOperandAttr(MachineInstr &OldMI1, MachineInstr &OldMI2,
MachineInstr &NewMI1,
MachineInstr &NewMI2) const override;
/// analyzeCompare - For a comparison instruction, return the source registers
/// in SrcReg and SrcReg2 if having two register operands, and the value it
/// compares against in CmpValue. Return true if the comparison instruction
/// can be analyzed.
bool analyzeCompare(const MachineInstr &MI, unsigned &SrcReg,
unsigned &SrcReg2, int &CmpMask,
int &CmpValue) const override;
/// optimizeCompareInstr - Check if there exists an earlier instruction that
/// operates on the same source operands and sets flags in the same way as
/// Compare; remove Compare if possible.
bool optimizeCompareInstr(MachineInstr &CmpInstr, unsigned SrcReg,
unsigned SrcReg2, int CmpMask, int CmpValue,
const MachineRegisterInfo *MRI) const override;
/// optimizeLoadInstr - Try to remove the load by folding it to a register
/// operand at the use. We fold the load instructions if and only if the
/// def and use are in the same BB. We only look at one load and see
/// whether it can be folded into MI. FoldAsLoadDefReg is the virtual register
/// defined by the load we are trying to fold. DefMI returns the machine
/// instruction that defines FoldAsLoadDefReg, and the function returns
/// the machine instruction generated due to folding.
MachineInstr *optimizeLoadInstr(MachineInstr &MI,
const MachineRegisterInfo *MRI,
unsigned &FoldAsLoadDefReg,
MachineInstr *&DefMI) const override;
std::pair<unsigned, unsigned>
decomposeMachineOperandsTargetFlags(unsigned TF) const override;
ArrayRef<std::pair<unsigned, const char *>>
getSerializableDirectMachineOperandTargetFlags() const override;
bool isTailCall(const MachineInstr &Inst) const override;
protected:
/// Commutes the operands in the given instruction by changing the operands
/// order and/or changing the instruction's opcode and/or the immediate value
/// operand.
///
/// The arguments 'CommuteOpIdx1' and 'CommuteOpIdx2' specify the operands
/// to be commuted.
///
/// Do not call this method for a non-commutable instruction or
/// non-commutable operands.
/// Even though the instruction is commutable, the method may still
/// fail to commute the operands, null pointer is returned in such cases.
MachineInstr *commuteInstructionImpl(MachineInstr &MI, bool NewMI,
unsigned CommuteOpIdx1,
unsigned CommuteOpIdx2) const override;
private:
MachineInstr *convertToThreeAddressWithLEA(unsigned MIOpc,
MachineFunction::iterator &MFI,
MachineInstr &MI,
LiveVariables *LV) const;
/// Handles memory folding for special case instructions, for instance those
/// requiring custom manipulation of the address.
MachineInstr *foldMemoryOperandCustom(MachineFunction &MF, MachineInstr &MI,
unsigned OpNum,
ArrayRef<MachineOperand> MOs,
MachineBasicBlock::iterator InsertPt,
unsigned Size, unsigned Align) const;
/// isFrameOperand - Return true and the FrameIndex if the specified
/// operand and follow operands form a reference to the stack frame.
bool isFrameOperand(const MachineInstr &MI, unsigned int Op,
int &FrameIndex) const;
/// Returns true iff the routine could find two commutable operands in the
/// given machine instruction with 3 vector inputs.
/// The 'SrcOpIdx1' and 'SrcOpIdx2' are INPUT and OUTPUT arguments. Their
/// input values can be re-defined in this method only if the input values
/// are not pre-defined, which is designated by the special value
/// 'CommuteAnyOperandIndex' assigned to it.
/// If both of indices are pre-defined and refer to some operands, then the
/// method simply returns true if the corresponding operands are commutable
/// and returns false otherwise.
///
/// For example, calling this method this way:
/// unsigned Op1 = 1, Op2 = CommuteAnyOperandIndex;
/// findThreeSrcCommutedOpIndices(MI, Op1, Op2);
/// can be interpreted as a query asking to find an operand that would be
/// commutable with the operand#1.
bool findThreeSrcCommutedOpIndices(const MachineInstr &MI,
unsigned &SrcOpIdx1,
unsigned &SrcOpIdx2) const;
};
} // End llvm namespace
#endif
Index: projects/clang400-import/contrib/llvm/lib/Target/X86/X86MCInstLower.cpp
===================================================================
--- projects/clang400-import/contrib/llvm/lib/Target/X86/X86MCInstLower.cpp (revision 313642)
+++ projects/clang400-import/contrib/llvm/lib/Target/X86/X86MCInstLower.cpp (revision 313643)
@@ -1,1745 +1,1738 @@
//===-- X86MCInstLower.cpp - Convert X86 MachineInstr to an MCInst --------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file contains code to lower X86 MachineInstrs to their corresponding
// MCInst records.
//
//===----------------------------------------------------------------------===//
#include "X86AsmPrinter.h"
#include "X86RegisterInfo.h"
#include "X86ShuffleDecodeConstantPool.h"
#include "InstPrinter/X86ATTInstPrinter.h"
#include "InstPrinter/X86InstComments.h"
#include "MCTargetDesc/X86BaseInfo.h"
#include "Utils/X86ShuffleDecode.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineConstantPool.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineModuleInfoImpls.h"
#include "llvm/CodeGen/StackMaps.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/Mangler.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCCodeEmitter.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCFixup.h"
#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCInstBuilder.h"
#include "llvm/MC/MCSection.h"
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSymbol.h"
#include "llvm/MC/MCSymbolELF.h"
#include "llvm/MC/MCSectionELF.h"
#include "llvm/MC/MCSectionMachO.h"
#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/ELF.h"
#include "llvm/Target/TargetLoweringObjectFile.h"
using namespace llvm;
namespace {
/// X86MCInstLower - This class is used to lower an MachineInstr into an MCInst.
class X86MCInstLower {
MCContext &Ctx;
const MachineFunction &MF;
const TargetMachine &TM;
const MCAsmInfo &MAI;
X86AsmPrinter &AsmPrinter;
public:
X86MCInstLower(const MachineFunction &MF, X86AsmPrinter &asmprinter);
Optional<MCOperand> LowerMachineOperand(const MachineInstr *MI,
const MachineOperand &MO) const;
void Lower(const MachineInstr *MI, MCInst &OutMI) const;
MCSymbol *GetSymbolFromOperand(const MachineOperand &MO) const;
MCOperand LowerSymbolOperand(const MachineOperand &MO, MCSymbol *Sym) const;
private:
MachineModuleInfoMachO &getMachOMMI() const;
};
} // end anonymous namespace
// Emit a minimal sequence of nops spanning NumBytes bytes.
static void EmitNops(MCStreamer &OS, unsigned NumBytes, bool Is64Bit,
const MCSubtargetInfo &STI);
void X86AsmPrinter::StackMapShadowTracker::count(MCInst &Inst,
const MCSubtargetInfo &STI,
MCCodeEmitter *CodeEmitter) {
if (InShadow) {
SmallString<256> Code;
SmallVector<MCFixup, 4> Fixups;
raw_svector_ostream VecOS(Code);
CodeEmitter->encodeInstruction(Inst, VecOS, Fixups, STI);
CurrentShadowSize += Code.size();
if (CurrentShadowSize >= RequiredShadowSize)
InShadow = false; // The shadow is big enough. Stop counting.
}
}
void X86AsmPrinter::StackMapShadowTracker::emitShadowPadding(
MCStreamer &OutStreamer, const MCSubtargetInfo &STI) {
if (InShadow && CurrentShadowSize < RequiredShadowSize) {
InShadow = false;
EmitNops(OutStreamer, RequiredShadowSize - CurrentShadowSize,
MF->getSubtarget<X86Subtarget>().is64Bit(), STI);
}
}
void X86AsmPrinter::EmitAndCountInstruction(MCInst &Inst) {
OutStreamer->EmitInstruction(Inst, getSubtargetInfo());
SMShadowTracker.count(Inst, getSubtargetInfo(), CodeEmitter.get());
}
X86MCInstLower::X86MCInstLower(const MachineFunction &mf,
X86AsmPrinter &asmprinter)
: Ctx(mf.getContext()), MF(mf), TM(mf.getTarget()), MAI(*TM.getMCAsmInfo()),
AsmPrinter(asmprinter) {}
MachineModuleInfoMachO &X86MCInstLower::getMachOMMI() const {
return MF.getMMI().getObjFileInfo<MachineModuleInfoMachO>();
}
/// GetSymbolFromOperand - Lower an MO_GlobalAddress or MO_ExternalSymbol
/// operand to an MCSymbol.
MCSymbol *X86MCInstLower::
GetSymbolFromOperand(const MachineOperand &MO) const {
const DataLayout &DL = MF.getDataLayout();
assert((MO.isGlobal() || MO.isSymbol() || MO.isMBB()) && "Isn't a symbol reference");
MCSymbol *Sym = nullptr;
SmallString<128> Name;
StringRef Suffix;
switch (MO.getTargetFlags()) {
case X86II::MO_DLLIMPORT:
// Handle dllimport linkage.
Name += "__imp_";
break;
case X86II::MO_DARWIN_NONLAZY:
case X86II::MO_DARWIN_NONLAZY_PIC_BASE:
Suffix = "$non_lazy_ptr";
break;
}
if (!Suffix.empty())
Name += DL.getPrivateGlobalPrefix();
if (MO.isGlobal()) {
const GlobalValue *GV = MO.getGlobal();
AsmPrinter.getNameWithPrefix(Name, GV);
} else if (MO.isSymbol()) {
Mangler::getNameWithPrefix(Name, MO.getSymbolName(), DL);
} else if (MO.isMBB()) {
assert(Suffix.empty());
Sym = MO.getMBB()->getSymbol();
}
Name += Suffix;
if (!Sym)
Sym = Ctx.getOrCreateSymbol(Name);
// If the target flags on the operand changes the name of the symbol, do that
// before we return the symbol.
switch (MO.getTargetFlags()) {
default: break;
case X86II::MO_DARWIN_NONLAZY:
case X86II::MO_DARWIN_NONLAZY_PIC_BASE: {
MachineModuleInfoImpl::StubValueTy &StubSym =
getMachOMMI().getGVStubEntry(Sym);
if (!StubSym.getPointer()) {
assert(MO.isGlobal() && "Extern symbol not handled yet");
StubSym =
MachineModuleInfoImpl::
StubValueTy(AsmPrinter.getSymbol(MO.getGlobal()),
!MO.getGlobal()->hasInternalLinkage());
}
break;
}
}
return Sym;
}
MCOperand X86MCInstLower::LowerSymbolOperand(const MachineOperand &MO,
MCSymbol *Sym) const {
// FIXME: We would like an efficient form for this, so we don't have to do a
// lot of extra uniquing.
const MCExpr *Expr = nullptr;
MCSymbolRefExpr::VariantKind RefKind = MCSymbolRefExpr::VK_None;
switch (MO.getTargetFlags()) {
default: llvm_unreachable("Unknown target flag on GV operand");
case X86II::MO_NO_FLAG: // No flag.
// These affect the name of the symbol, not any suffix.
case X86II::MO_DARWIN_NONLAZY:
case X86II::MO_DLLIMPORT:
break;
case X86II::MO_TLVP: RefKind = MCSymbolRefExpr::VK_TLVP; break;
case X86II::MO_TLVP_PIC_BASE:
Expr = MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_TLVP, Ctx);
// Subtract the pic base.
Expr = MCBinaryExpr::createSub(Expr,
MCSymbolRefExpr::create(MF.getPICBaseSymbol(),
Ctx),
Ctx);
break;
case X86II::MO_SECREL: RefKind = MCSymbolRefExpr::VK_SECREL; break;
case X86II::MO_TLSGD: RefKind = MCSymbolRefExpr::VK_TLSGD; break;
case X86II::MO_TLSLD: RefKind = MCSymbolRefExpr::VK_TLSLD; break;
case X86II::MO_TLSLDM: RefKind = MCSymbolRefExpr::VK_TLSLDM; break;
case X86II::MO_GOTTPOFF: RefKind = MCSymbolRefExpr::VK_GOTTPOFF; break;
case X86II::MO_INDNTPOFF: RefKind = MCSymbolRefExpr::VK_INDNTPOFF; break;
case X86II::MO_TPOFF: RefKind = MCSymbolRefExpr::VK_TPOFF; break;
case X86II::MO_DTPOFF: RefKind = MCSymbolRefExpr::VK_DTPOFF; break;
case X86II::MO_NTPOFF: RefKind = MCSymbolRefExpr::VK_NTPOFF; break;
case X86II::MO_GOTNTPOFF: RefKind = MCSymbolRefExpr::VK_GOTNTPOFF; break;
case X86II::MO_GOTPCREL: RefKind = MCSymbolRefExpr::VK_GOTPCREL; break;
case X86II::MO_GOT: RefKind = MCSymbolRefExpr::VK_GOT; break;
case X86II::MO_GOTOFF: RefKind = MCSymbolRefExpr::VK_GOTOFF; break;
case X86II::MO_PLT: RefKind = MCSymbolRefExpr::VK_PLT; break;
case X86II::MO_PIC_BASE_OFFSET:
case X86II::MO_DARWIN_NONLAZY_PIC_BASE:
Expr = MCSymbolRefExpr::create(Sym, Ctx);
// Subtract the pic base.
Expr = MCBinaryExpr::createSub(Expr,
MCSymbolRefExpr::create(MF.getPICBaseSymbol(), Ctx),
Ctx);
if (MO.isJTI()) {
assert(MAI.doesSetDirectiveSuppressReloc());
// If .set directive is supported, use it to reduce the number of
// relocations the assembler will generate for differences between
// local labels. This is only safe when the symbols are in the same
// section so we are restricting it to jumptable references.
MCSymbol *Label = Ctx.createTempSymbol();
AsmPrinter.OutStreamer->EmitAssignment(Label, Expr);
Expr = MCSymbolRefExpr::create(Label, Ctx);
}
break;
}
if (!Expr)
Expr = MCSymbolRefExpr::create(Sym, RefKind, Ctx);
if (!MO.isJTI() && !MO.isMBB() && MO.getOffset())
Expr = MCBinaryExpr::createAdd(Expr,
MCConstantExpr::create(MO.getOffset(), Ctx),
Ctx);
return MCOperand::createExpr(Expr);
}
/// \brief Simplify FOO $imm, %{al,ax,eax,rax} to FOO $imm, for instruction with
/// a short fixed-register form.
static void SimplifyShortImmForm(MCInst &Inst, unsigned Opcode) {
unsigned ImmOp = Inst.getNumOperands() - 1;
assert(Inst.getOperand(0).isReg() &&
(Inst.getOperand(ImmOp).isImm() || Inst.getOperand(ImmOp).isExpr()) &&
((Inst.getNumOperands() == 3 && Inst.getOperand(1).isReg() &&
Inst.getOperand(0).getReg() == Inst.getOperand(1).getReg()) ||
Inst.getNumOperands() == 2) && "Unexpected instruction!");
// Check whether the destination register can be fixed.
unsigned Reg = Inst.getOperand(0).getReg();
if (Reg != X86::AL && Reg != X86::AX && Reg != X86::EAX && Reg != X86::RAX)
return;
// If so, rewrite the instruction.
MCOperand Saved = Inst.getOperand(ImmOp);
Inst = MCInst();
Inst.setOpcode(Opcode);
Inst.addOperand(Saved);
}
/// \brief If a movsx instruction has a shorter encoding for the used register
/// simplify the instruction to use it instead.
static void SimplifyMOVSX(MCInst &Inst) {
unsigned NewOpcode = 0;
unsigned Op0 = Inst.getOperand(0).getReg(), Op1 = Inst.getOperand(1).getReg();
switch (Inst.getOpcode()) {
default:
llvm_unreachable("Unexpected instruction!");
case X86::MOVSX16rr8: // movsbw %al, %ax --> cbtw
if (Op0 == X86::AX && Op1 == X86::AL)
NewOpcode = X86::CBW;
break;
case X86::MOVSX32rr16: // movswl %ax, %eax --> cwtl
if (Op0 == X86::EAX && Op1 == X86::AX)
NewOpcode = X86::CWDE;
break;
case X86::MOVSX64rr32: // movslq %eax, %rax --> cltq
if (Op0 == X86::RAX && Op1 == X86::EAX)
NewOpcode = X86::CDQE;
break;
}
if (NewOpcode != 0) {
Inst = MCInst();
Inst.setOpcode(NewOpcode);
}
}
/// \brief Simplify things like MOV32rm to MOV32o32a.
static void SimplifyShortMoveForm(X86AsmPrinter &Printer, MCInst &Inst,
unsigned Opcode) {
// Don't make these simplifications in 64-bit mode; other assemblers don't
// perform them because they make the code larger.
if (Printer.getSubtarget().is64Bit())
return;
bool IsStore = Inst.getOperand(0).isReg() && Inst.getOperand(1).isReg();
unsigned AddrBase = IsStore;
unsigned RegOp = IsStore ? 0 : 5;
unsigned AddrOp = AddrBase + 3;
assert(Inst.getNumOperands() == 6 && Inst.getOperand(RegOp).isReg() &&
Inst.getOperand(AddrBase + X86::AddrBaseReg).isReg() &&
Inst.getOperand(AddrBase + X86::AddrScaleAmt).isImm() &&
Inst.getOperand(AddrBase + X86::AddrIndexReg).isReg() &&
Inst.getOperand(AddrBase + X86::AddrSegmentReg).isReg() &&
(Inst.getOperand(AddrOp).isExpr() ||
Inst.getOperand(AddrOp).isImm()) &&
"Unexpected instruction!");
// Check whether the destination register can be fixed.
unsigned Reg = Inst.getOperand(RegOp).getReg();
if (Reg != X86::AL && Reg != X86::AX && Reg != X86::EAX && Reg != X86::RAX)
return;
// Check whether this is an absolute address.
// FIXME: We know TLVP symbol refs aren't, but there should be a better way
// to do this here.
bool Absolute = true;
if (Inst.getOperand(AddrOp).isExpr()) {
const MCExpr *MCE = Inst.getOperand(AddrOp).getExpr();
if (const MCSymbolRefExpr *SRE = dyn_cast<MCSymbolRefExpr>(MCE))
if (SRE->getKind() == MCSymbolRefExpr::VK_TLVP)
Absolute = false;
}
if (Absolute &&
(Inst.getOperand(AddrBase + X86::AddrBaseReg).getReg() != 0 ||
Inst.getOperand(AddrBase + X86::AddrScaleAmt).getImm() != 1 ||
Inst.getOperand(AddrBase + X86::AddrIndexReg).getReg() != 0))
return;
// If so, rewrite the instruction.
MCOperand Saved = Inst.getOperand(AddrOp);
MCOperand Seg = Inst.getOperand(AddrBase + X86::AddrSegmentReg);
Inst = MCInst();
Inst.setOpcode(Opcode);
Inst.addOperand(Saved);
Inst.addOperand(Seg);
}
static unsigned getRetOpcode(const X86Subtarget &Subtarget) {
return Subtarget.is64Bit() ? X86::RETQ : X86::RETL;
}
Optional<MCOperand>
X86MCInstLower::LowerMachineOperand(const MachineInstr *MI,
const MachineOperand &MO) const {
switch (MO.getType()) {
default:
MI->dump();
llvm_unreachable("unknown operand type");
case MachineOperand::MO_Register:
// Ignore all implicit register operands.
if (MO.isImplicit())
return None;
return MCOperand::createReg(MO.getReg());
case MachineOperand::MO_Immediate:
return MCOperand::createImm(MO.getImm());
case MachineOperand::MO_MachineBasicBlock:
case MachineOperand::MO_GlobalAddress:
case MachineOperand::MO_ExternalSymbol:
return LowerSymbolOperand(MO, GetSymbolFromOperand(MO));
case MachineOperand::MO_MCSymbol:
return LowerSymbolOperand(MO, MO.getMCSymbol());
case MachineOperand::MO_JumpTableIndex:
return LowerSymbolOperand(MO, AsmPrinter.GetJTISymbol(MO.getIndex()));
case MachineOperand::MO_ConstantPoolIndex:
return LowerSymbolOperand(MO, AsmPrinter.GetCPISymbol(MO.getIndex()));
case MachineOperand::MO_BlockAddress:
return LowerSymbolOperand(
MO, AsmPrinter.GetBlockAddressSymbol(MO.getBlockAddress()));
case MachineOperand::MO_RegisterMask:
// Ignore call clobbers.
return None;
}
}
void X86MCInstLower::Lower(const MachineInstr *MI, MCInst &OutMI) const {
OutMI.setOpcode(MI->getOpcode());
for (const MachineOperand &MO : MI->operands())
if (auto MaybeMCOp = LowerMachineOperand(MI, MO))
OutMI.addOperand(MaybeMCOp.getValue());
// Handle a few special cases to eliminate operand modifiers.
ReSimplify:
switch (OutMI.getOpcode()) {
case X86::LEA64_32r:
case X86::LEA64r:
case X86::LEA16r:
case X86::LEA32r:
// LEA should have a segment register, but it must be empty.
assert(OutMI.getNumOperands() == 1+X86::AddrNumOperands &&
"Unexpected # of LEA operands");
assert(OutMI.getOperand(1+X86::AddrSegmentReg).getReg() == 0 &&
"LEA has segment specified!");
break;
// Commute operands to get a smaller encoding by using VEX.R instead of VEX.B
// if one of the registers is extended, but other isn't.
case X86::VMOVZPQILo2PQIrr:
case X86::VMOVAPDrr:
case X86::VMOVAPDYrr:
case X86::VMOVAPSrr:
case X86::VMOVAPSYrr:
case X86::VMOVDQArr:
case X86::VMOVDQAYrr:
case X86::VMOVDQUrr:
case X86::VMOVDQUYrr:
case X86::VMOVUPDrr:
case X86::VMOVUPDYrr:
case X86::VMOVUPSrr:
case X86::VMOVUPSYrr: {
if (!X86II::isX86_64ExtendedReg(OutMI.getOperand(0).getReg()) &&
X86II::isX86_64ExtendedReg(OutMI.getOperand(1).getReg())) {
unsigned NewOpc;
switch (OutMI.getOpcode()) {
default: llvm_unreachable("Invalid opcode");
case X86::VMOVZPQILo2PQIrr: NewOpc = X86::VMOVPQI2QIrr; break;
case X86::VMOVAPDrr: NewOpc = X86::VMOVAPDrr_REV; break;
case X86::VMOVAPDYrr: NewOpc = X86::VMOVAPDYrr_REV; break;
case X86::VMOVAPSrr: NewOpc = X86::VMOVAPSrr_REV; break;
case X86::VMOVAPSYrr: NewOpc = X86::VMOVAPSYrr_REV; break;
case X86::VMOVDQArr: NewOpc = X86::VMOVDQArr_REV; break;
case X86::VMOVDQAYrr: NewOpc = X86::VMOVDQAYrr_REV; break;
case X86::VMOVDQUrr: NewOpc = X86::VMOVDQUrr_REV; break;
case X86::VMOVDQUYrr: NewOpc = X86::VMOVDQUYrr_REV; break;
case X86::VMOVUPDrr: NewOpc = X86::VMOVUPDrr_REV; break;
case X86::VMOVUPDYrr: NewOpc = X86::VMOVUPDYrr_REV; break;
case X86::VMOVUPSrr: NewOpc = X86::VMOVUPSrr_REV; break;
case X86::VMOVUPSYrr: NewOpc = X86::VMOVUPSYrr_REV; break;
}
OutMI.setOpcode(NewOpc);
}
break;
}
case X86::VMOVSDrr:
case X86::VMOVSSrr: {
if (!X86II::isX86_64ExtendedReg(OutMI.getOperand(0).getReg()) &&
X86II::isX86_64ExtendedReg(OutMI.getOperand(2).getReg())) {
unsigned NewOpc;
switch (OutMI.getOpcode()) {
default: llvm_unreachable("Invalid opcode");
case X86::VMOVSDrr: NewOpc = X86::VMOVSDrr_REV; break;
case X86::VMOVSSrr: NewOpc = X86::VMOVSSrr_REV; break;
}
OutMI.setOpcode(NewOpc);
}
break;
}
// TAILJMPr64, CALL64r, CALL64pcrel32 - These instructions have register
// inputs modeled as normal uses instead of implicit uses. As such, truncate
// off all but the first operand (the callee). FIXME: Change isel.
case X86::TAILJMPr64:
case X86::TAILJMPr64_REX:
case X86::CALL64r:
case X86::CALL64pcrel32: {
unsigned Opcode = OutMI.getOpcode();
MCOperand Saved = OutMI.getOperand(0);
OutMI = MCInst();
OutMI.setOpcode(Opcode);
OutMI.addOperand(Saved);
break;
}
case X86::EH_RETURN:
case X86::EH_RETURN64: {
OutMI = MCInst();
OutMI.setOpcode(getRetOpcode(AsmPrinter.getSubtarget()));
break;
}
case X86::CLEANUPRET: {
// Replace CATCHRET with the appropriate RET.
OutMI = MCInst();
OutMI.setOpcode(getRetOpcode(AsmPrinter.getSubtarget()));
break;
}
case X86::CATCHRET: {
// Replace CATCHRET with the appropriate RET.
const X86Subtarget &Subtarget = AsmPrinter.getSubtarget();
unsigned ReturnReg = Subtarget.is64Bit() ? X86::RAX : X86::EAX;
OutMI = MCInst();
OutMI.setOpcode(getRetOpcode(Subtarget));
OutMI.addOperand(MCOperand::createReg(ReturnReg));
break;
}
- // TAILJMPd, TAILJMPd64, TailJMPd_cc - Lower to the correct jump instruction.
+ // TAILJMPd, TAILJMPd64 - Lower to the correct jump instruction.
{ unsigned Opcode;
case X86::TAILJMPr: Opcode = X86::JMP32r; goto SetTailJmpOpcode;
case X86::TAILJMPd:
case X86::TAILJMPd64: Opcode = X86::JMP_1; goto SetTailJmpOpcode;
- case X86::TAILJMPd_CC:
- case X86::TAILJMPd64_CC:
- Opcode = X86::GetCondBranchFromCond(
- static_cast<X86::CondCode>(MI->getOperand(1).getImm()));
- goto SetTailJmpOpcode;
SetTailJmpOpcode:
MCOperand Saved = OutMI.getOperand(0);
OutMI = MCInst();
OutMI.setOpcode(Opcode);
OutMI.addOperand(Saved);
break;
}
case X86::DEC16r:
case X86::DEC32r:
case X86::INC16r:
case X86::INC32r:
// If we aren't in 64-bit mode we can use the 1-byte inc/dec instructions.
if (!AsmPrinter.getSubtarget().is64Bit()) {
unsigned Opcode;
switch (OutMI.getOpcode()) {
default: llvm_unreachable("Invalid opcode");
case X86::DEC16r: Opcode = X86::DEC16r_alt; break;
case X86::DEC32r: Opcode = X86::DEC32r_alt; break;
case X86::INC16r: Opcode = X86::INC16r_alt; break;
case X86::INC32r: Opcode = X86::INC32r_alt; break;
}
OutMI.setOpcode(Opcode);
}
break;
// These are pseudo-ops for OR to help with the OR->ADD transformation. We do
// this with an ugly goto in case the resultant OR uses EAX and needs the
// short form.
case X86::ADD16rr_DB: OutMI.setOpcode(X86::OR16rr); goto ReSimplify;
case X86::ADD32rr_DB: OutMI.setOpcode(X86::OR32rr); goto ReSimplify;
case X86::ADD64rr_DB: OutMI.setOpcode(X86::OR64rr); goto ReSimplify;
case X86::ADD16ri_DB: OutMI.setOpcode(X86::OR16ri); goto ReSimplify;
case X86::ADD32ri_DB: OutMI.setOpcode(X86::OR32ri); goto ReSimplify;
case X86::ADD64ri32_DB: OutMI.setOpcode(X86::OR64ri32); goto ReSimplify;
case X86::ADD16ri8_DB: OutMI.setOpcode(X86::OR16ri8); goto ReSimplify;
case X86::ADD32ri8_DB: OutMI.setOpcode(X86::OR32ri8); goto ReSimplify;
case X86::ADD64ri8_DB: OutMI.setOpcode(X86::OR64ri8); goto ReSimplify;
// Atomic load and store require a separate pseudo-inst because Acquire
// implies mayStore and Release implies mayLoad; fix these to regular MOV
// instructions here
case X86::ACQUIRE_MOV8rm: OutMI.setOpcode(X86::MOV8rm); goto ReSimplify;
case X86::ACQUIRE_MOV16rm: OutMI.setOpcode(X86::MOV16rm); goto ReSimplify;
case X86::ACQUIRE_MOV32rm: OutMI.setOpcode(X86::MOV32rm); goto ReSimplify;
case X86::ACQUIRE_MOV64rm: OutMI.setOpcode(X86::MOV64rm); goto ReSimplify;
case X86::RELEASE_MOV8mr: OutMI.setOpcode(X86::MOV8mr); goto ReSimplify;
case X86::RELEASE_MOV16mr: OutMI.setOpcode(X86::MOV16mr); goto ReSimplify;
case X86::RELEASE_MOV32mr: OutMI.setOpcode(X86::MOV32mr); goto ReSimplify;
case X86::RELEASE_MOV64mr: OutMI.setOpcode(X86::MOV64mr); goto ReSimplify;
case X86::RELEASE_MOV8mi: OutMI.setOpcode(X86::MOV8mi); goto ReSimplify;
case X86::RELEASE_MOV16mi: OutMI.setOpcode(X86::MOV16mi); goto ReSimplify;
case X86::RELEASE_MOV32mi: OutMI.setOpcode(X86::MOV32mi); goto ReSimplify;
case X86::RELEASE_MOV64mi32: OutMI.setOpcode(X86::MOV64mi32); goto ReSimplify;
case X86::RELEASE_ADD8mi: OutMI.setOpcode(X86::ADD8mi); goto ReSimplify;
case X86::RELEASE_ADD8mr: OutMI.setOpcode(X86::ADD8mr); goto ReSimplify;
case X86::RELEASE_ADD32mi: OutMI.setOpcode(X86::ADD32mi); goto ReSimplify;
case X86::RELEASE_ADD32mr: OutMI.setOpcode(X86::ADD32mr); goto ReSimplify;
case X86::RELEASE_ADD64mi32: OutMI.setOpcode(X86::ADD64mi32); goto ReSimplify;
case X86::RELEASE_ADD64mr: OutMI.setOpcode(X86::ADD64mr); goto ReSimplify;
case X86::RELEASE_AND8mi: OutMI.setOpcode(X86::AND8mi); goto ReSimplify;
case X86::RELEASE_AND8mr: OutMI.setOpcode(X86::AND8mr); goto ReSimplify;
case X86::RELEASE_AND32mi: OutMI.setOpcode(X86::AND32mi); goto ReSimplify;
case X86::RELEASE_AND32mr: OutMI.setOpcode(X86::AND32mr); goto ReSimplify;
case X86::RELEASE_AND64mi32: OutMI.setOpcode(X86::AND64mi32); goto ReSimplify;
case X86::RELEASE_AND64mr: OutMI.setOpcode(X86::AND64mr); goto ReSimplify;
case X86::RELEASE_OR8mi: OutMI.setOpcode(X86::OR8mi); goto ReSimplify;
case X86::RELEASE_OR8mr: OutMI.setOpcode(X86::OR8mr); goto ReSimplify;
case X86::RELEASE_OR32mi: OutMI.setOpcode(X86::OR32mi); goto ReSimplify;
case X86::RELEASE_OR32mr: OutMI.setOpcode(X86::OR32mr); goto ReSimplify;
case X86::RELEASE_OR64mi32: OutMI.setOpcode(X86::OR64mi32); goto ReSimplify;
case X86::RELEASE_OR64mr: OutMI.setOpcode(X86::OR64mr); goto ReSimplify;
case X86::RELEASE_XOR8mi: OutMI.setOpcode(X86::XOR8mi); goto ReSimplify;
case X86::RELEASE_XOR8mr: OutMI.setOpcode(X86::XOR8mr); goto ReSimplify;
case X86::RELEASE_XOR32mi: OutMI.setOpcode(X86::XOR32mi); goto ReSimplify;
case X86::RELEASE_XOR32mr: OutMI.setOpcode(X86::XOR32mr); goto ReSimplify;
case X86::RELEASE_XOR64mi32: OutMI.setOpcode(X86::XOR64mi32); goto ReSimplify;
case X86::RELEASE_XOR64mr: OutMI.setOpcode(X86::XOR64mr); goto ReSimplify;
case X86::RELEASE_INC8m: OutMI.setOpcode(X86::INC8m); goto ReSimplify;
case X86::RELEASE_INC16m: OutMI.setOpcode(X86::INC16m); goto ReSimplify;
case X86::RELEASE_INC32m: OutMI.setOpcode(X86::INC32m); goto ReSimplify;
case X86::RELEASE_INC64m: OutMI.setOpcode(X86::INC64m); goto ReSimplify;
case X86::RELEASE_DEC8m: OutMI.setOpcode(X86::DEC8m); goto ReSimplify;
case X86::RELEASE_DEC16m: OutMI.setOpcode(X86::DEC16m); goto ReSimplify;
case X86::RELEASE_DEC32m: OutMI.setOpcode(X86::DEC32m); goto ReSimplify;
case X86::RELEASE_DEC64m: OutMI.setOpcode(X86::DEC64m); goto ReSimplify;
// We don't currently select the correct instruction form for instructions
// which have a short %eax, etc. form. Handle this by custom lowering, for
// now.
//
// Note, we are currently not handling the following instructions:
// MOV64ao8, MOV64o8a
// XCHG16ar, XCHG32ar, XCHG64ar
case X86::MOV8mr_NOREX:
case X86::MOV8mr:
case X86::MOV8rm_NOREX:
case X86::MOV8rm:
case X86::MOV16mr:
case X86::MOV16rm:
case X86::MOV32mr:
case X86::MOV32rm: {
unsigned NewOpc;
switch (OutMI.getOpcode()) {
default: llvm_unreachable("Invalid opcode");
case X86::MOV8mr_NOREX:
case X86::MOV8mr: NewOpc = X86::MOV8o32a; break;
case X86::MOV8rm_NOREX:
case X86::MOV8rm: NewOpc = X86::MOV8ao32; break;
case X86::MOV16mr: NewOpc = X86::MOV16o32a; break;
case X86::MOV16rm: NewOpc = X86::MOV16ao32; break;
case X86::MOV32mr: NewOpc = X86::MOV32o32a; break;
case X86::MOV32rm: NewOpc = X86::MOV32ao32; break;
}
SimplifyShortMoveForm(AsmPrinter, OutMI, NewOpc);
break;
}
case X86::ADC8ri: case X86::ADC16ri: case X86::ADC32ri: case X86::ADC64ri32:
case X86::ADD8ri: case X86::ADD16ri: case X86::ADD32ri: case X86::ADD64ri32:
case X86::AND8ri: case X86::AND16ri: case X86::AND32ri: case X86::AND64ri32:
case X86::CMP8ri: case X86::CMP16ri: case X86::CMP32ri: case X86::CMP64ri32:
case X86::OR8ri: case X86::OR16ri: case X86::OR32ri: case X86::OR64ri32:
case X86::SBB8ri: case X86::SBB16ri: case X86::SBB32ri: case X86::SBB64ri32:
case X86::SUB8ri: case X86::SUB16ri: case X86::SUB32ri: case X86::SUB64ri32:
case X86::TEST8ri:case X86::TEST16ri:case X86::TEST32ri:case X86::TEST64ri32:
case X86::XOR8ri: case X86::XOR16ri: case X86::XOR32ri: case X86::XOR64ri32: {
unsigned NewOpc;
switch (OutMI.getOpcode()) {
default: llvm_unreachable("Invalid opcode");
case X86::ADC8ri: NewOpc = X86::ADC8i8; break;
case X86::ADC16ri: NewOpc = X86::ADC16i16; break;
case X86::ADC32ri: NewOpc = X86::ADC32i32; break;
case X86::ADC64ri32: NewOpc = X86::ADC64i32; break;
case X86::ADD8ri: NewOpc = X86::ADD8i8; break;
case X86::ADD16ri: NewOpc = X86::ADD16i16; break;
case X86::ADD32ri: NewOpc = X86::ADD32i32; break;
case X86::ADD64ri32: NewOpc = X86::ADD64i32; break;
case X86::AND8ri: NewOpc = X86::AND8i8; break;
case X86::AND16ri: NewOpc = X86::AND16i16; break;
case X86::AND32ri: NewOpc = X86::AND32i32; break;
case X86::AND64ri32: NewOpc = X86::AND64i32; break;
case X86::CMP8ri: NewOpc = X86::CMP8i8; break;
case X86::CMP16ri: NewOpc = X86::CMP16i16; break;
case X86::CMP32ri: NewOpc = X86::CMP32i32; break;
case X86::CMP64ri32: NewOpc = X86::CMP64i32; break;
case X86::OR8ri: NewOpc = X86::OR8i8; break;
case X86::OR16ri: NewOpc = X86::OR16i16; break;
case X86::OR32ri: NewOpc = X86::OR32i32; break;
case X86::OR64ri32: NewOpc = X86::OR64i32; break;
case X86::SBB8ri: NewOpc = X86::SBB8i8; break;
case X86::SBB16ri: NewOpc = X86::SBB16i16; break;
case X86::SBB32ri: NewOpc = X86::SBB32i32; break;
case X86::SBB64ri32: NewOpc = X86::SBB64i32; break;
case X86::SUB8ri: NewOpc = X86::SUB8i8; break;
case X86::SUB16ri: NewOpc = X86::SUB16i16; break;
case X86::SUB32ri: NewOpc = X86::SUB32i32; break;
case X86::SUB64ri32: NewOpc = X86::SUB64i32; break;
case X86::TEST8ri: NewOpc = X86::TEST8i8; break;
case X86::TEST16ri: NewOpc = X86::TEST16i16; break;
case X86::TEST32ri: NewOpc = X86::TEST32i32; break;
case X86::TEST64ri32: NewOpc = X86::TEST64i32; break;
case X86::XOR8ri: NewOpc = X86::XOR8i8; break;
case X86::XOR16ri: NewOpc = X86::XOR16i16; break;
case X86::XOR32ri: NewOpc = X86::XOR32i32; break;
case X86::XOR64ri32: NewOpc = X86::XOR64i32; break;
}
SimplifyShortImmForm(OutMI, NewOpc);
break;
}
// Try to shrink some forms of movsx.
case X86::MOVSX16rr8:
case X86::MOVSX32rr16:
case X86::MOVSX64rr32:
SimplifyMOVSX(OutMI);
break;
}
}
void X86AsmPrinter::LowerTlsAddr(X86MCInstLower &MCInstLowering,
const MachineInstr &MI) {
bool is64Bits = MI.getOpcode() == X86::TLS_addr64 ||
MI.getOpcode() == X86::TLS_base_addr64;
bool needsPadding = MI.getOpcode() == X86::TLS_addr64;
MCContext &context = OutStreamer->getContext();
if (needsPadding)
EmitAndCountInstruction(MCInstBuilder(X86::DATA16_PREFIX));
MCSymbolRefExpr::VariantKind SRVK;
switch (MI.getOpcode()) {
case X86::TLS_addr32:
case X86::TLS_addr64:
SRVK = MCSymbolRefExpr::VK_TLSGD;
break;
case X86::TLS_base_addr32:
SRVK = MCSymbolRefExpr::VK_TLSLDM;
break;
case X86::TLS_base_addr64:
SRVK = MCSymbolRefExpr::VK_TLSLD;
break;
default:
llvm_unreachable("unexpected opcode");
}
MCSymbol *sym = MCInstLowering.GetSymbolFromOperand(MI.getOperand(3));
const MCSymbolRefExpr *symRef = MCSymbolRefExpr::create(sym, SRVK, context);
MCInst LEA;
if (is64Bits) {
LEA.setOpcode(X86::LEA64r);
LEA.addOperand(MCOperand::createReg(X86::RDI)); // dest
LEA.addOperand(MCOperand::createReg(X86::RIP)); // base
LEA.addOperand(MCOperand::createImm(1)); // scale
LEA.addOperand(MCOperand::createReg(0)); // index
LEA.addOperand(MCOperand::createExpr(symRef)); // disp
LEA.addOperand(MCOperand::createReg(0)); // seg
} else if (SRVK == MCSymbolRefExpr::VK_TLSLDM) {
LEA.setOpcode(X86::LEA32r);
LEA.addOperand(MCOperand::createReg(X86::EAX)); // dest
LEA.addOperand(MCOperand::createReg(X86::EBX)); // base
LEA.addOperand(MCOperand::createImm(1)); // scale
LEA.addOperand(MCOperand::createReg(0)); // index
LEA.addOperand(MCOperand::createExpr(symRef)); // disp
LEA.addOperand(MCOperand::createReg(0)); // seg
} else {
LEA.setOpcode(X86::LEA32r);
LEA.addOperand(MCOperand::createReg(X86::EAX)); // dest
LEA.addOperand(MCOperand::createReg(0)); // base
LEA.addOperand(MCOperand::createImm(1)); // scale
LEA.addOperand(MCOperand::createReg(X86::EBX)); // index
LEA.addOperand(MCOperand::createExpr(symRef)); // disp
LEA.addOperand(MCOperand::createReg(0)); // seg
}
EmitAndCountInstruction(LEA);
if (needsPadding) {
EmitAndCountInstruction(MCInstBuilder(X86::DATA16_PREFIX));
EmitAndCountInstruction(MCInstBuilder(X86::DATA16_PREFIX));
EmitAndCountInstruction(MCInstBuilder(X86::REX64_PREFIX));
}
StringRef name = is64Bits ? "__tls_get_addr" : "___tls_get_addr";
MCSymbol *tlsGetAddr = context.getOrCreateSymbol(name);
const MCSymbolRefExpr *tlsRef =
MCSymbolRefExpr::create(tlsGetAddr,
MCSymbolRefExpr::VK_PLT,
context);
EmitAndCountInstruction(MCInstBuilder(is64Bits ? X86::CALL64pcrel32
: X86::CALLpcrel32)
.addExpr(tlsRef));
}
/// \brief Emit the largest nop instruction smaller than or equal to \p NumBytes
/// bytes. Return the size of nop emitted.
static unsigned EmitNop(MCStreamer &OS, unsigned NumBytes, bool Is64Bit,
const MCSubtargetInfo &STI) {
// This works only for 64bit. For 32bit we have to do additional checking if
// the CPU supports multi-byte nops.
assert(Is64Bit && "EmitNops only supports X86-64");
unsigned NopSize;
unsigned Opc, BaseReg, ScaleVal, IndexReg, Displacement, SegmentReg;
Opc = IndexReg = Displacement = SegmentReg = 0;
BaseReg = X86::RAX;
ScaleVal = 1;
switch (NumBytes) {
case 0: llvm_unreachable("Zero nops?"); break;
case 1: NopSize = 1; Opc = X86::NOOP; break;
case 2: NopSize = 2; Opc = X86::XCHG16ar; break;
case 3: NopSize = 3; Opc = X86::NOOPL; break;
case 4: NopSize = 4; Opc = X86::NOOPL; Displacement = 8; break;
case 5: NopSize = 5; Opc = X86::NOOPL; Displacement = 8;
IndexReg = X86::RAX; break;
case 6: NopSize = 6; Opc = X86::NOOPW; Displacement = 8;
IndexReg = X86::RAX; break;
case 7: NopSize = 7; Opc = X86::NOOPL; Displacement = 512; break;
case 8: NopSize = 8; Opc = X86::NOOPL; Displacement = 512;
IndexReg = X86::RAX; break;
case 9: NopSize = 9; Opc = X86::NOOPW; Displacement = 512;
IndexReg = X86::RAX; break;
default: NopSize = 10; Opc = X86::NOOPW; Displacement = 512;
IndexReg = X86::RAX; SegmentReg = X86::CS; break;
}
unsigned NumPrefixes = std::min(NumBytes - NopSize, 5U);
NopSize += NumPrefixes;
for (unsigned i = 0; i != NumPrefixes; ++i)
OS.EmitBytes("\x66");
switch (Opc) {
default:
llvm_unreachable("Unexpected opcode");
break;
case X86::NOOP:
OS.EmitInstruction(MCInstBuilder(Opc), STI);
break;
case X86::XCHG16ar:
OS.EmitInstruction(MCInstBuilder(Opc).addReg(X86::AX), STI);
break;
case X86::NOOPL:
case X86::NOOPW:
OS.EmitInstruction(MCInstBuilder(Opc)
.addReg(BaseReg)
.addImm(ScaleVal)
.addReg(IndexReg)
.addImm(Displacement)
.addReg(SegmentReg),
STI);
break;
}
assert(NopSize <= NumBytes && "We overemitted?");
return NopSize;
}
/// \brief Emit the optimal amount of multi-byte nops on X86.
static void EmitNops(MCStreamer &OS, unsigned NumBytes, bool Is64Bit,
const MCSubtargetInfo &STI) {
unsigned NopsToEmit = NumBytes;
(void)NopsToEmit;
while (NumBytes) {
NumBytes -= EmitNop(OS, NumBytes, Is64Bit, STI);
assert(NopsToEmit >= NumBytes && "Emitted more than I asked for!");
}
}
void X86AsmPrinter::LowerSTATEPOINT(const MachineInstr &MI,
X86MCInstLower &MCIL) {
assert(Subtarget->is64Bit() && "Statepoint currently only supports X86-64");
StatepointOpers SOpers(&MI);
if (unsigned PatchBytes = SOpers.getNumPatchBytes()) {
EmitNops(*OutStreamer, PatchBytes, Subtarget->is64Bit(),
getSubtargetInfo());
} else {
// Lower call target and choose correct opcode
const MachineOperand &CallTarget = SOpers.getCallTarget();
MCOperand CallTargetMCOp;
unsigned CallOpcode;
switch (CallTarget.getType()) {
case MachineOperand::MO_GlobalAddress:
case MachineOperand::MO_ExternalSymbol:
CallTargetMCOp = MCIL.LowerSymbolOperand(
CallTarget, MCIL.GetSymbolFromOperand(CallTarget));
CallOpcode = X86::CALL64pcrel32;
// Currently, we only support relative addressing with statepoints.
// Otherwise, we'll need a scratch register to hold the target
// address. You'll fail asserts during load & relocation if this
// symbol is to far away. (TODO: support non-relative addressing)
break;
case MachineOperand::MO_Immediate:
CallTargetMCOp = MCOperand::createImm(CallTarget.getImm());
CallOpcode = X86::CALL64pcrel32;
// Currently, we only support relative addressing with statepoints.
// Otherwise, we'll need a scratch register to hold the target
// immediate. You'll fail asserts during load & relocation if this
// address is to far away. (TODO: support non-relative addressing)
break;
case MachineOperand::MO_Register:
CallTargetMCOp = MCOperand::createReg(CallTarget.getReg());
CallOpcode = X86::CALL64r;
break;
default:
llvm_unreachable("Unsupported operand type in statepoint call target");
break;
}
// Emit call
MCInst CallInst;
CallInst.setOpcode(CallOpcode);
CallInst.addOperand(CallTargetMCOp);
OutStreamer->EmitInstruction(CallInst, getSubtargetInfo());
}
// Record our statepoint node in the same section used by STACKMAP
// and PATCHPOINT
SM.recordStatepoint(MI);
}
void X86AsmPrinter::LowerFAULTING_LOAD_OP(const MachineInstr &MI,
X86MCInstLower &MCIL) {
// FAULTING_LOAD_OP <def>, <MBB handler>, <load opcode>, <load operands>
unsigned LoadDefRegister = MI.getOperand(0).getReg();
MCSymbol *HandlerLabel = MI.getOperand(1).getMBB()->getSymbol();
unsigned LoadOpcode = MI.getOperand(2).getImm();
unsigned LoadOperandsBeginIdx = 3;
FM.recordFaultingOp(FaultMaps::FaultingLoad, HandlerLabel);
MCInst LoadMI;
LoadMI.setOpcode(LoadOpcode);
if (LoadDefRegister != X86::NoRegister)
LoadMI.addOperand(MCOperand::createReg(LoadDefRegister));
for (auto I = MI.operands_begin() + LoadOperandsBeginIdx,
E = MI.operands_end();
I != E; ++I)
if (auto MaybeOperand = MCIL.LowerMachineOperand(&MI, *I))
LoadMI.addOperand(MaybeOperand.getValue());
OutStreamer->EmitInstruction(LoadMI, getSubtargetInfo());
}
void X86AsmPrinter::LowerPATCHABLE_OP(const MachineInstr &MI,
X86MCInstLower &MCIL) {
// PATCHABLE_OP minsize, opcode, operands
unsigned MinSize = MI.getOperand(0).getImm();
unsigned Opcode = MI.getOperand(1).getImm();
MCInst MCI;
MCI.setOpcode(Opcode);
for (auto &MO : make_range(MI.operands_begin() + 2, MI.operands_end()))
if (auto MaybeOperand = MCIL.LowerMachineOperand(&MI, MO))
MCI.addOperand(MaybeOperand.getValue());
SmallString<256> Code;
SmallVector<MCFixup, 4> Fixups;
raw_svector_ostream VecOS(Code);
CodeEmitter->encodeInstruction(MCI, VecOS, Fixups, getSubtargetInfo());
if (Code.size() < MinSize) {
if (MinSize == 2 && Opcode == X86::PUSH64r) {
// This is an optimization that lets us get away without emitting a nop in
// many cases.
//
// NB! In some cases the encoding for PUSH64r (e.g. PUSH64r %R9) takes two
// bytes too, so the check on MinSize is important.
MCI.setOpcode(X86::PUSH64rmr);
} else {
unsigned NopSize = EmitNop(*OutStreamer, MinSize, Subtarget->is64Bit(),
getSubtargetInfo());
assert(NopSize == MinSize && "Could not implement MinSize!");
(void) NopSize;
}
}
OutStreamer->EmitInstruction(MCI, getSubtargetInfo());
}
// Lower a stackmap of the form:
// <id>, <shadowBytes>, ...
void X86AsmPrinter::LowerSTACKMAP(const MachineInstr &MI) {
SMShadowTracker.emitShadowPadding(*OutStreamer, getSubtargetInfo());
SM.recordStackMap(MI);
unsigned NumShadowBytes = MI.getOperand(1).getImm();
SMShadowTracker.reset(NumShadowBytes);
}
// Lower a patchpoint of the form:
// [<def>], <id>, <numBytes>, <target>, <numArgs>, <cc>, ...
void X86AsmPrinter::LowerPATCHPOINT(const MachineInstr &MI,
X86MCInstLower &MCIL) {
assert(Subtarget->is64Bit() && "Patchpoint currently only supports X86-64");
SMShadowTracker.emitShadowPadding(*OutStreamer, getSubtargetInfo());
SM.recordPatchPoint(MI);
PatchPointOpers opers(&MI);
unsigned ScratchIdx = opers.getNextScratchIdx();
unsigned EncodedBytes = 0;
const MachineOperand &CalleeMO = opers.getCallTarget();
// Check for null target. If target is non-null (i.e. is non-zero or is
// symbolic) then emit a call.
if (!(CalleeMO.isImm() && !CalleeMO.getImm())) {
MCOperand CalleeMCOp;
switch (CalleeMO.getType()) {
default:
/// FIXME: Add a verifier check for bad callee types.
llvm_unreachable("Unrecognized callee operand type.");
case MachineOperand::MO_Immediate:
if (CalleeMO.getImm())
CalleeMCOp = MCOperand::createImm(CalleeMO.getImm());
break;
case MachineOperand::MO_ExternalSymbol:
case MachineOperand::MO_GlobalAddress:
CalleeMCOp =
MCIL.LowerSymbolOperand(CalleeMO,
MCIL.GetSymbolFromOperand(CalleeMO));
break;
}
// Emit MOV to materialize the target address and the CALL to target.
// This is encoded with 12-13 bytes, depending on which register is used.
unsigned ScratchReg = MI.getOperand(ScratchIdx).getReg();
if (X86II::isX86_64ExtendedReg(ScratchReg))
EncodedBytes = 13;
else
EncodedBytes = 12;
EmitAndCountInstruction(
MCInstBuilder(X86::MOV64ri).addReg(ScratchReg).addOperand(CalleeMCOp));
EmitAndCountInstruction(MCInstBuilder(X86::CALL64r).addReg(ScratchReg));
}
// Emit padding.
unsigned NumBytes = opers.getNumPatchBytes();
assert(NumBytes >= EncodedBytes &&
"Patchpoint can't request size less than the length of a call.");
EmitNops(*OutStreamer, NumBytes - EncodedBytes, Subtarget->is64Bit(),
getSubtargetInfo());
}
void X86AsmPrinter::LowerPATCHABLE_FUNCTION_ENTER(const MachineInstr &MI,
X86MCInstLower &MCIL) {
// We want to emit the following pattern:
//
// .p2align 1, ...
// .Lxray_sled_N:
// jmp .tmpN
// # 9 bytes worth of noops
// .tmpN
//
// We need the 9 bytes because at runtime, we'd be patching over the full 11
// bytes with the following pattern:
//
// mov %r10, <function id, 32-bit> // 6 bytes
// call <relative offset, 32-bits> // 5 bytes
//
auto CurSled = OutContext.createTempSymbol("xray_sled_", true);
OutStreamer->EmitCodeAlignment(2);
OutStreamer->EmitLabel(CurSled);
auto Target = OutContext.createTempSymbol();
// Use a two-byte `jmp`. This version of JMP takes an 8-bit relative offset as
// an operand (computed as an offset from the jmp instruction).
// FIXME: Find another less hacky way do force the relative jump.
OutStreamer->EmitBytes("\xeb\x09");
EmitNops(*OutStreamer, 9, Subtarget->is64Bit(), getSubtargetInfo());
OutStreamer->EmitLabel(Target);
recordSled(CurSled, MI, SledKind::FUNCTION_ENTER);
}
void X86AsmPrinter::LowerPATCHABLE_RET(const MachineInstr &MI,
X86MCInstLower &MCIL) {
// Since PATCHABLE_RET takes the opcode of the return statement as an
// argument, we use that to emit the correct form of the RET that we want.
// i.e. when we see this:
//
// PATCHABLE_RET X86::RET ...
//
// We should emit the RET followed by sleds.
//
// .p2align 1, ...
// .Lxray_sled_N:
// ret # or equivalent instruction
// # 10 bytes worth of noops
//
// This just makes sure that the alignment for the next instruction is 2.
auto CurSled = OutContext.createTempSymbol("xray_sled_", true);
OutStreamer->EmitCodeAlignment(2);
OutStreamer->EmitLabel(CurSled);
unsigned OpCode = MI.getOperand(0).getImm();
MCInst Ret;
Ret.setOpcode(OpCode);
for (auto &MO : make_range(MI.operands_begin() + 1, MI.operands_end()))
if (auto MaybeOperand = MCIL.LowerMachineOperand(&MI, MO))
Ret.addOperand(MaybeOperand.getValue());
OutStreamer->EmitInstruction(Ret, getSubtargetInfo());
EmitNops(*OutStreamer, 10, Subtarget->is64Bit(), getSubtargetInfo());
recordSled(CurSled, MI, SledKind::FUNCTION_EXIT);
}
void X86AsmPrinter::LowerPATCHABLE_TAIL_CALL(const MachineInstr &MI, X86MCInstLower &MCIL) {
// Like PATCHABLE_RET, we have the actual instruction in the operands to this
// instruction so we lower that particular instruction and its operands.
// Unlike PATCHABLE_RET though, we put the sled before the JMP, much like how
// we do it for PATCHABLE_FUNCTION_ENTER. The sled should be very similar to
// the PATCHABLE_FUNCTION_ENTER case, followed by the lowering of the actual
// tail call much like how we have it in PATCHABLE_RET.
auto CurSled = OutContext.createTempSymbol("xray_sled_", true);
OutStreamer->EmitCodeAlignment(2);
OutStreamer->EmitLabel(CurSled);
auto Target = OutContext.createTempSymbol();
// Use a two-byte `jmp`. This version of JMP takes an 8-bit relative offset as
// an operand (computed as an offset from the jmp instruction).
// FIXME: Find another less hacky way do force the relative jump.
OutStreamer->EmitBytes("\xeb\x09");
EmitNops(*OutStreamer, 9, Subtarget->is64Bit(), getSubtargetInfo());
OutStreamer->EmitLabel(Target);
recordSled(CurSled, MI, SledKind::TAIL_CALL);
unsigned OpCode = MI.getOperand(0).getImm();
MCInst TC;
TC.setOpcode(OpCode);
// Before emitting the instruction, add a comment to indicate that this is
// indeed a tail call.
OutStreamer->AddComment("TAILCALL");
for (auto &MO : make_range(MI.operands_begin() + 1, MI.operands_end()))
if (auto MaybeOperand = MCIL.LowerMachineOperand(&MI, MO))
TC.addOperand(MaybeOperand.getValue());
OutStreamer->EmitInstruction(TC, getSubtargetInfo());
}
// Returns instruction preceding MBBI in MachineFunction.
// If MBBI is the first instruction of the first basic block, returns null.
static MachineBasicBlock::const_iterator
PrevCrossBBInst(MachineBasicBlock::const_iterator MBBI) {
const MachineBasicBlock *MBB = MBBI->getParent();
while (MBBI == MBB->begin()) {
if (MBB == &MBB->getParent()->front())
return MachineBasicBlock::const_iterator();
MBB = MBB->getPrevNode();
MBBI = MBB->end();
}
return --MBBI;
}
static const Constant *getConstantFromPool(const MachineInstr &MI,
const MachineOperand &Op) {
if (!Op.isCPI())
return nullptr;
ArrayRef<MachineConstantPoolEntry> Constants =
MI.getParent()->getParent()->getConstantPool()->getConstants();
const MachineConstantPoolEntry &ConstantEntry =
Constants[Op.getIndex()];
// Bail if this is a machine constant pool entry, we won't be able to dig out
// anything useful.
if (ConstantEntry.isMachineConstantPoolEntry())
return nullptr;
auto *C = dyn_cast<Constant>(ConstantEntry.Val.ConstVal);
assert((!C || ConstantEntry.getType() == C->getType()) &&
"Expected a constant of the same type!");
return C;
}
static std::string getShuffleComment(const MachineInstr *MI,
unsigned SrcOp1Idx,
unsigned SrcOp2Idx,
ArrayRef<int> Mask) {
std::string Comment;
// Compute the name for a register. This is really goofy because we have
// multiple instruction printers that could (in theory) use different
// names. Fortunately most people use the ATT style (outside of Windows)
// and they actually agree on register naming here. Ultimately, this is
// a comment, and so its OK if it isn't perfect.
auto GetRegisterName = [](unsigned RegNum) -> StringRef {
return X86ATTInstPrinter::getRegisterName(RegNum);
};
const MachineOperand &DstOp = MI->getOperand(0);
const MachineOperand &SrcOp1 = MI->getOperand(SrcOp1Idx);
const MachineOperand &SrcOp2 = MI->getOperand(SrcOp2Idx);
StringRef DstName = DstOp.isReg() ? GetRegisterName(DstOp.getReg()) : "mem";
StringRef Src1Name =
SrcOp1.isReg() ? GetRegisterName(SrcOp1.getReg()) : "mem";
StringRef Src2Name =
SrcOp2.isReg() ? GetRegisterName(SrcOp2.getReg()) : "mem";
// One source operand, fix the mask to print all elements in one span.
SmallVector<int, 8> ShuffleMask(Mask.begin(), Mask.end());
if (Src1Name == Src2Name)
for (int i = 0, e = ShuffleMask.size(); i != e; ++i)
if (ShuffleMask[i] >= e)
ShuffleMask[i] -= e;
raw_string_ostream CS(Comment);
CS << DstName;
// Handle AVX512 MASK/MASXZ write mask comments.
// MASK: zmmX {%kY}
// MASKZ: zmmX {%kY} {z}
if (SrcOp1Idx > 1) {
assert((SrcOp1Idx == 2 || SrcOp1Idx == 3) && "Unexpected writemask");
const MachineOperand &WriteMaskOp = MI->getOperand(SrcOp1Idx - 1);
if (WriteMaskOp.isReg()) {
CS << " {%" << GetRegisterName(WriteMaskOp.getReg()) << "}";
if (SrcOp1Idx == 2) {
CS << " {z}";
}
}
}
CS << " = ";
for (int i = 0, e = ShuffleMask.size(); i != e; ++i) {
if (i != 0)
CS << ",";
if (ShuffleMask[i] == SM_SentinelZero) {
CS << "zero";
continue;
}
// Otherwise, it must come from src1 or src2. Print the span of elements
// that comes from this src.
bool isSrc1 = ShuffleMask[i] < (int)e;
CS << (isSrc1 ? Src1Name : Src2Name) << '[';
bool IsFirst = true;
while (i != e && ShuffleMask[i] != SM_SentinelZero &&
(ShuffleMask[i] < (int)e) == isSrc1) {
if (!IsFirst)
CS << ',';
else
IsFirst = false;
if (ShuffleMask[i] == SM_SentinelUndef)
CS << "u";
else
CS << ShuffleMask[i] % (int)e;
++i;
}
CS << ']';
--i; // For loop increments element #.
}
CS.flush();
return Comment;
}
void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) {
X86MCInstLower MCInstLowering(*MF, *this);
const X86RegisterInfo *RI = MF->getSubtarget<X86Subtarget>().getRegisterInfo();
// Add a comment about EVEX-2-VEX compression for AVX-512 instrs that
// are compressed from EVEX encoding to VEX encoding.
if (TM.Options.MCOptions.ShowMCEncoding) {
if (MI->getAsmPrinterFlags() & AC_EVEX_2_VEX)
OutStreamer->AddComment("EVEX TO VEX Compression ", false);
}
switch (MI->getOpcode()) {
case TargetOpcode::DBG_VALUE:
llvm_unreachable("Should be handled target independently");
// Emit nothing here but a comment if we can.
case X86::Int_MemBarrier:
OutStreamer->emitRawComment("MEMBARRIER");
return;
case X86::EH_RETURN:
case X86::EH_RETURN64: {
// Lower these as normal, but add some comments.
unsigned Reg = MI->getOperand(0).getReg();
OutStreamer->AddComment(StringRef("eh_return, addr: %") +
X86ATTInstPrinter::getRegisterName(Reg));
break;
}
case X86::CLEANUPRET: {
// Lower these as normal, but add some comments.
OutStreamer->AddComment("CLEANUPRET");
break;
}
case X86::CATCHRET: {
// Lower these as normal, but add some comments.
OutStreamer->AddComment("CATCHRET");
break;
}
case X86::TAILJMPr:
case X86::TAILJMPm:
case X86::TAILJMPd:
- case X86::TAILJMPd_CC:
case X86::TAILJMPr64:
case X86::TAILJMPm64:
case X86::TAILJMPd64:
- case X86::TAILJMPd64_CC:
case X86::TAILJMPr64_REX:
case X86::TAILJMPm64_REX:
// Lower these as normal, but add some comments.
OutStreamer->AddComment("TAILCALL");
break;
case X86::TLS_addr32:
case X86::TLS_addr64:
case X86::TLS_base_addr32:
case X86::TLS_base_addr64:
return LowerTlsAddr(MCInstLowering, *MI);
case X86::MOVPC32r: {
// This is a pseudo op for a two instruction sequence with a label, which
// looks like:
// call "L1$pb"
// "L1$pb":
// popl %esi
// Emit the call.
MCSymbol *PICBase = MF->getPICBaseSymbol();
// FIXME: We would like an efficient form for this, so we don't have to do a
// lot of extra uniquing.
EmitAndCountInstruction(MCInstBuilder(X86::CALLpcrel32)
.addExpr(MCSymbolRefExpr::create(PICBase, OutContext)));
const X86FrameLowering* FrameLowering =
MF->getSubtarget<X86Subtarget>().getFrameLowering();
bool hasFP = FrameLowering->hasFP(*MF);
// TODO: This is needed only if we require precise CFA.
bool HasActiveDwarfFrame = OutStreamer->getNumFrameInfos() &&
!OutStreamer->getDwarfFrameInfos().back().End;
int stackGrowth = -RI->getSlotSize();
if (HasActiveDwarfFrame && !hasFP) {
OutStreamer->EmitCFIAdjustCfaOffset(-stackGrowth);
}
// Emit the label.
OutStreamer->EmitLabel(PICBase);
// popl $reg
EmitAndCountInstruction(MCInstBuilder(X86::POP32r)
.addReg(MI->getOperand(0).getReg()));
if (HasActiveDwarfFrame && !hasFP) {
OutStreamer->EmitCFIAdjustCfaOffset(stackGrowth);
}
return;
}
case X86::ADD32ri: {
// Lower the MO_GOT_ABSOLUTE_ADDRESS form of ADD32ri.
if (MI->getOperand(2).getTargetFlags() != X86II::MO_GOT_ABSOLUTE_ADDRESS)
break;
// Okay, we have something like:
// EAX = ADD32ri EAX, MO_GOT_ABSOLUTE_ADDRESS(@MYGLOBAL)
// For this, we want to print something like:
// MYGLOBAL + (. - PICBASE)
// However, we can't generate a ".", so just emit a new label here and refer
// to it.
MCSymbol *DotSym = OutContext.createTempSymbol();
OutStreamer->EmitLabel(DotSym);
// Now that we have emitted the label, lower the complex operand expression.
MCSymbol *OpSym = MCInstLowering.GetSymbolFromOperand(MI->getOperand(2));
const MCExpr *DotExpr = MCSymbolRefExpr::create(DotSym, OutContext);
const MCExpr *PICBase =
MCSymbolRefExpr::create(MF->getPICBaseSymbol(), OutContext);
DotExpr = MCBinaryExpr::createSub(DotExpr, PICBase, OutContext);
DotExpr = MCBinaryExpr::createAdd(MCSymbolRefExpr::create(OpSym,OutContext),
DotExpr, OutContext);
EmitAndCountInstruction(MCInstBuilder(X86::ADD32ri)
.addReg(MI->getOperand(0).getReg())
.addReg(MI->getOperand(1).getReg())
.addExpr(DotExpr));
return;
}
case TargetOpcode::STATEPOINT:
return LowerSTATEPOINT(*MI, MCInstLowering);
case TargetOpcode::FAULTING_LOAD_OP:
return LowerFAULTING_LOAD_OP(*MI, MCInstLowering);
case TargetOpcode::PATCHABLE_OP:
return LowerPATCHABLE_OP(*MI, MCInstLowering);
case TargetOpcode::STACKMAP:
return LowerSTACKMAP(*MI);
case TargetOpcode::PATCHPOINT:
return LowerPATCHPOINT(*MI, MCInstLowering);
case TargetOpcode::PATCHABLE_FUNCTION_ENTER:
return LowerPATCHABLE_FUNCTION_ENTER(*MI, MCInstLowering);
case TargetOpcode::PATCHABLE_RET:
return LowerPATCHABLE_RET(*MI, MCInstLowering);
case TargetOpcode::PATCHABLE_TAIL_CALL:
return LowerPATCHABLE_TAIL_CALL(*MI, MCInstLowering);
case X86::MORESTACK_RET:
EmitAndCountInstruction(MCInstBuilder(getRetOpcode(*Subtarget)));
return;
case X86::MORESTACK_RET_RESTORE_R10:
// Return, then restore R10.
EmitAndCountInstruction(MCInstBuilder(getRetOpcode(*Subtarget)));
EmitAndCountInstruction(MCInstBuilder(X86::MOV64rr)
.addReg(X86::R10)
.addReg(X86::RAX));
return;
case X86::SEH_PushReg:
assert(MF->hasWinCFI() && "SEH_ instruction in function without WinCFI?");
OutStreamer->EmitWinCFIPushReg(RI->getSEHRegNum(MI->getOperand(0).getImm()));
return;
case X86::SEH_SaveReg:
assert(MF->hasWinCFI() && "SEH_ instruction in function without WinCFI?");
OutStreamer->EmitWinCFISaveReg(RI->getSEHRegNum(MI->getOperand(0).getImm()),
MI->getOperand(1).getImm());
return;
case X86::SEH_SaveXMM:
assert(MF->hasWinCFI() && "SEH_ instruction in function without WinCFI?");
OutStreamer->EmitWinCFISaveXMM(RI->getSEHRegNum(MI->getOperand(0).getImm()),
MI->getOperand(1).getImm());
return;
case X86::SEH_StackAlloc:
assert(MF->hasWinCFI() && "SEH_ instruction in function without WinCFI?");
OutStreamer->EmitWinCFIAllocStack(MI->getOperand(0).getImm());
return;
case X86::SEH_SetFrame:
assert(MF->hasWinCFI() && "SEH_ instruction in function without WinCFI?");
OutStreamer->EmitWinCFISetFrame(RI->getSEHRegNum(MI->getOperand(0).getImm()),
MI->getOperand(1).getImm());
return;
case X86::SEH_PushFrame:
assert(MF->hasWinCFI() && "SEH_ instruction in function without WinCFI?");
OutStreamer->EmitWinCFIPushFrame(MI->getOperand(0).getImm());
return;
case X86::SEH_EndPrologue:
assert(MF->hasWinCFI() && "SEH_ instruction in function without WinCFI?");
OutStreamer->EmitWinCFIEndProlog();
return;
case X86::SEH_Epilogue: {
assert(MF->hasWinCFI() && "SEH_ instruction in function without WinCFI?");
MachineBasicBlock::const_iterator MBBI(MI);
// Check if preceded by a call and emit nop if so.
for (MBBI = PrevCrossBBInst(MBBI);
MBBI != MachineBasicBlock::const_iterator();
MBBI = PrevCrossBBInst(MBBI)) {
// Conservatively assume that pseudo instructions don't emit code and keep
// looking for a call. We may emit an unnecessary nop in some cases.
if (!MBBI->isPseudo()) {
if (MBBI->isCall())
EmitAndCountInstruction(MCInstBuilder(X86::NOOP));
break;
}
}
return;
}
// Lower PSHUFB and VPERMILP normally but add a comment if we can find
// a constant shuffle mask. We won't be able to do this at the MC layer
// because the mask isn't an immediate.
case X86::PSHUFBrm:
case X86::VPSHUFBrm:
case X86::VPSHUFBYrm:
case X86::VPSHUFBZ128rm:
case X86::VPSHUFBZ128rmk:
case X86::VPSHUFBZ128rmkz:
case X86::VPSHUFBZ256rm:
case X86::VPSHUFBZ256rmk:
case X86::VPSHUFBZ256rmkz:
case X86::VPSHUFBZrm:
case X86::VPSHUFBZrmk:
case X86::VPSHUFBZrmkz: {
if (!OutStreamer->isVerboseAsm())
break;
unsigned SrcIdx, MaskIdx;
switch (MI->getOpcode()) {
default: llvm_unreachable("Invalid opcode");
case X86::PSHUFBrm:
case X86::VPSHUFBrm:
case X86::VPSHUFBYrm:
case X86::VPSHUFBZ128rm:
case X86::VPSHUFBZ256rm:
case X86::VPSHUFBZrm:
SrcIdx = 1; MaskIdx = 5; break;
case X86::VPSHUFBZ128rmkz:
case X86::VPSHUFBZ256rmkz:
case X86::VPSHUFBZrmkz:
SrcIdx = 2; MaskIdx = 6; break;
case X86::VPSHUFBZ128rmk:
case X86::VPSHUFBZ256rmk:
case X86::VPSHUFBZrmk:
SrcIdx = 3; MaskIdx = 7; break;
}
assert(MI->getNumOperands() >= 6 &&
"We should always have at least 6 operands!");
const MachineOperand &MaskOp = MI->getOperand(MaskIdx);
if (auto *C = getConstantFromPool(*MI, MaskOp)) {
SmallVector<int, 64> Mask;
DecodePSHUFBMask(C, Mask);
if (!Mask.empty())
OutStreamer->AddComment(getShuffleComment(MI, SrcIdx, SrcIdx, Mask));
}
break;
}
case X86::VPERMILPSrm:
case X86::VPERMILPSYrm:
case X86::VPERMILPSZ128rm:
case X86::VPERMILPSZ128rmk:
case X86::VPERMILPSZ128rmkz:
case X86::VPERMILPSZ256rm:
case X86::VPERMILPSZ256rmk:
case X86::VPERMILPSZ256rmkz:
case X86::VPERMILPSZrm:
case X86::VPERMILPSZrmk:
case X86::VPERMILPSZrmkz:
case X86::VPERMILPDrm:
case X86::VPERMILPDYrm:
case X86::VPERMILPDZ128rm:
case X86::VPERMILPDZ128rmk:
case X86::VPERMILPDZ128rmkz:
case X86::VPERMILPDZ256rm:
case X86::VPERMILPDZ256rmk:
case X86::VPERMILPDZ256rmkz:
case X86::VPERMILPDZrm:
case X86::VPERMILPDZrmk:
case X86::VPERMILPDZrmkz: {
if (!OutStreamer->isVerboseAsm())
break;
unsigned SrcIdx, MaskIdx;
unsigned ElSize;
switch (MI->getOpcode()) {
default: llvm_unreachable("Invalid opcode");
case X86::VPERMILPSrm:
case X86::VPERMILPSYrm:
case X86::VPERMILPSZ128rm:
case X86::VPERMILPSZ256rm:
case X86::VPERMILPSZrm:
SrcIdx = 1; MaskIdx = 5; ElSize = 32; break;
case X86::VPERMILPSZ128rmkz:
case X86::VPERMILPSZ256rmkz:
case X86::VPERMILPSZrmkz:
SrcIdx = 2; MaskIdx = 6; ElSize = 32; break;
case X86::VPERMILPSZ128rmk:
case X86::VPERMILPSZ256rmk:
case X86::VPERMILPSZrmk:
SrcIdx = 3; MaskIdx = 7; ElSize = 32; break;
case X86::VPERMILPDrm:
case X86::VPERMILPDYrm:
case X86::VPERMILPDZ128rm:
case X86::VPERMILPDZ256rm:
case X86::VPERMILPDZrm:
SrcIdx = 1; MaskIdx = 5; ElSize = 64; break;
case X86::VPERMILPDZ128rmkz:
case X86::VPERMILPDZ256rmkz:
case X86::VPERMILPDZrmkz:
SrcIdx = 2; MaskIdx = 6; ElSize = 64; break;
case X86::VPERMILPDZ128rmk:
case X86::VPERMILPDZ256rmk:
case X86::VPERMILPDZrmk:
SrcIdx = 3; MaskIdx = 7; ElSize = 64; break;
}
assert(MI->getNumOperands() >= 6 &&
"We should always have at least 6 operands!");
const MachineOperand &MaskOp = MI->getOperand(MaskIdx);
if (auto *C = getConstantFromPool(*MI, MaskOp)) {
SmallVector<int, 16> Mask;
DecodeVPERMILPMask(C, ElSize, Mask);
if (!Mask.empty())
OutStreamer->AddComment(getShuffleComment(MI, SrcIdx, SrcIdx, Mask));
}
break;
}
case X86::VPERMIL2PDrm:
case X86::VPERMIL2PSrm:
case X86::VPERMIL2PDrmY:
case X86::VPERMIL2PSrmY: {
if (!OutStreamer->isVerboseAsm())
break;
assert(MI->getNumOperands() >= 8 &&
"We should always have at least 8 operands!");
const MachineOperand &CtrlOp = MI->getOperand(MI->getNumOperands() - 1);
if (!CtrlOp.isImm())
break;
unsigned ElSize;
switch (MI->getOpcode()) {
default: llvm_unreachable("Invalid opcode");
case X86::VPERMIL2PSrm: case X86::VPERMIL2PSrmY: ElSize = 32; break;
case X86::VPERMIL2PDrm: case X86::VPERMIL2PDrmY: ElSize = 64; break;
}
const MachineOperand &MaskOp = MI->getOperand(6);
if (auto *C = getConstantFromPool(*MI, MaskOp)) {
SmallVector<int, 16> Mask;
DecodeVPERMIL2PMask(C, (unsigned)CtrlOp.getImm(), ElSize, Mask);
if (!Mask.empty())
OutStreamer->AddComment(getShuffleComment(MI, 1, 2, Mask));
}
break;
}
case X86::VPPERMrrm: {
if (!OutStreamer->isVerboseAsm())
break;
assert(MI->getNumOperands() >= 7 &&
"We should always have at least 7 operands!");
const MachineOperand &MaskOp = MI->getOperand(6);
if (auto *C = getConstantFromPool(*MI, MaskOp)) {
SmallVector<int, 16> Mask;
DecodeVPPERMMask(C, Mask);
if (!Mask.empty())
OutStreamer->AddComment(getShuffleComment(MI, 1, 2, Mask));
}
break;
}
#define MOV_CASE(Prefix, Suffix) \
case X86::Prefix##MOVAPD##Suffix##rm: \
case X86::Prefix##MOVAPS##Suffix##rm: \
case X86::Prefix##MOVUPD##Suffix##rm: \
case X86::Prefix##MOVUPS##Suffix##rm: \
case X86::Prefix##MOVDQA##Suffix##rm: \
case X86::Prefix##MOVDQU##Suffix##rm:
#define MOV_AVX512_CASE(Suffix) \
case X86::VMOVDQA64##Suffix##rm: \
case X86::VMOVDQA32##Suffix##rm: \
case X86::VMOVDQU64##Suffix##rm: \
case X86::VMOVDQU32##Suffix##rm: \
case X86::VMOVDQU16##Suffix##rm: \
case X86::VMOVDQU8##Suffix##rm: \
case X86::VMOVAPS##Suffix##rm: \
case X86::VMOVAPD##Suffix##rm: \
case X86::VMOVUPS##Suffix##rm: \
case X86::VMOVUPD##Suffix##rm:
#define CASE_ALL_MOV_RM() \
MOV_CASE(, ) /* SSE */ \
MOV_CASE(V, ) /* AVX-128 */ \
MOV_CASE(V, Y) /* AVX-256 */ \
MOV_AVX512_CASE(Z) \
MOV_AVX512_CASE(Z256) \
MOV_AVX512_CASE(Z128)
// For loads from a constant pool to a vector register, print the constant
// loaded.
CASE_ALL_MOV_RM()
if (!OutStreamer->isVerboseAsm())
break;
if (MI->getNumOperands() <= 4)
break;
if (auto *C = getConstantFromPool(*MI, MI->getOperand(4))) {
std::string Comment;
raw_string_ostream CS(Comment);
const MachineOperand &DstOp = MI->getOperand(0);
CS << X86ATTInstPrinter::getRegisterName(DstOp.getReg()) << " = ";
if (auto *CDS = dyn_cast<ConstantDataSequential>(C)) {
CS << "[";
for (int i = 0, NumElements = CDS->getNumElements(); i < NumElements; ++i) {
if (i != 0)
CS << ",";
if (CDS->getElementType()->isIntegerTy())
CS << CDS->getElementAsInteger(i);
else if (CDS->getElementType()->isFloatTy())
CS << CDS->getElementAsFloat(i);
else if (CDS->getElementType()->isDoubleTy())
CS << CDS->getElementAsDouble(i);
else
CS << "?";
}
CS << "]";
OutStreamer->AddComment(CS.str());
} else if (auto *CV = dyn_cast<ConstantVector>(C)) {
CS << "<";
for (int i = 0, NumOperands = CV->getNumOperands(); i < NumOperands; ++i) {
if (i != 0)
CS << ",";
Constant *COp = CV->getOperand(i);
if (isa<UndefValue>(COp)) {
CS << "u";
} else if (auto *CI = dyn_cast<ConstantInt>(COp)) {
if (CI->getBitWidth() <= 64) {
CS << CI->getZExtValue();
} else {
// print multi-word constant as (w0,w1)
const auto &Val = CI->getValue();
CS << "(";
for (int i = 0, N = Val.getNumWords(); i < N; ++i) {
if (i > 0)
CS << ",";
CS << Val.getRawData()[i];
}
CS << ")";
}
} else if (auto *CF = dyn_cast<ConstantFP>(COp)) {
SmallString<32> Str;
CF->getValueAPF().toString(Str);
CS << Str;
} else {
CS << "?";
}
}
CS << ">";
OutStreamer->AddComment(CS.str());
}
}
break;
}
MCInst TmpInst;
MCInstLowering.Lower(MI, TmpInst);
// Stackmap shadows cannot include branch targets, so we can count the bytes
// in a call towards the shadow, but must ensure that the no thread returns
// in to the stackmap shadow. The only way to achieve this is if the call
// is at the end of the shadow.
if (MI->isCall()) {
// Count then size of the call towards the shadow
SMShadowTracker.count(TmpInst, getSubtargetInfo(), CodeEmitter.get());
// Then flush the shadow so that we fill with nops before the call, not
// after it.
SMShadowTracker.emitShadowPadding(*OutStreamer, getSubtargetInfo());
// Then emit the call
OutStreamer->EmitInstruction(TmpInst, getSubtargetInfo());
return;
}
EmitAndCountInstruction(TmpInst);
}
Index: projects/clang400-import/contrib/llvm/tools/clang
===================================================================
--- projects/clang400-import/contrib/llvm/tools/clang (revision 313642)
+++ projects/clang400-import/contrib/llvm/tools/clang (revision 313643)
Property changes on: projects/clang400-import/contrib/llvm/tools/clang
___________________________________________________________________
Modified: svn:mergeinfo
## -0,0 +0,1 ##
Merged /vendor/clang/dist:r313300-313642
Index: projects/clang400-import/contrib/llvm/tools/lld
===================================================================
--- projects/clang400-import/contrib/llvm/tools/lld (revision 313642)
+++ projects/clang400-import/contrib/llvm/tools/lld (revision 313643)
Property changes on: projects/clang400-import/contrib/llvm/tools/lld
___________________________________________________________________
Modified: svn:mergeinfo
## -0,0 +0,1 ##
Merged /vendor/lld/dist:r313300-313642
Index: projects/clang400-import/contrib/llvm/tools/lldb
===================================================================
--- projects/clang400-import/contrib/llvm/tools/lldb (revision 313642)
+++ projects/clang400-import/contrib/llvm/tools/lldb (revision 313643)
Property changes on: projects/clang400-import/contrib/llvm/tools/lldb
___________________________________________________________________
Modified: svn:mergeinfo
## -0,0 +0,1 ##
Merged /vendor/lldb/dist:r313300-313642
Index: projects/clang400-import/contrib/llvm
===================================================================
--- projects/clang400-import/contrib/llvm (revision 313642)
+++ projects/clang400-import/contrib/llvm (revision 313643)
Property changes on: projects/clang400-import/contrib/llvm
___________________________________________________________________
Modified: svn:mergeinfo
## -0,0 +0,1 ##
Merged /vendor/llvm/dist:r313300-313642
Index: projects/clang400-import/lib/clang/include/clang/Basic/Version.inc
===================================================================
--- projects/clang400-import/lib/clang/include/clang/Basic/Version.inc (revision 313642)
+++ projects/clang400-import/lib/clang/include/clang/Basic/Version.inc (revision 313643)
@@ -1,11 +1,11 @@
/* $FreeBSD$ */
#define CLANG_VERSION 4.0.0
#define CLANG_VERSION_STRING "4.0.0"
#define CLANG_VERSION_MAJOR 4
#define CLANG_VERSION_MINOR 0
#define CLANG_VERSION_PATCHLEVEL 0
#define CLANG_VENDOR "FreeBSD "
-#define SVN_REVISION "294123"
+#define SVN_REVISION "294803"
Index: projects/clang400-import/lib/clang/include/lld/Config/Version.inc
===================================================================
--- projects/clang400-import/lib/clang/include/lld/Config/Version.inc (revision 313642)
+++ projects/clang400-import/lib/clang/include/lld/Config/Version.inc (revision 313643)
@@ -1,8 +1,8 @@
// $FreeBSD$
#define LLD_VERSION 4.0.0
#define LLD_VERSION_STRING "4.0.0"
#define LLD_VERSION_MAJOR 4
#define LLD_VERSION_MINOR 0
-#define LLD_REVISION_STRING "294123"
+#define LLD_REVISION_STRING "294803"
#define LLD_REPOSITORY_STRING "FreeBSD"

File Metadata

Mime Type
text/x-c++
Expires
Thu, Mar 19, 6:31 AM (2 d)
Storage Engine
local-disk
Storage Format
Raw Data
Storage Handle
39/fa/4da7c987d8c412f61443efbad14e
Default Alt Text
(1 MB)

Event Timeline